library(tidyverse)
## -- Attaching packages --------------------------------------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.2.1 v purrr 0.3.3
## v tibble 2.1.3 v dplyr 0.8.4
## v tidyr 1.0.2 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.4.0
## -- Conflicts ------------------------------------------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
This file is for analysis of several coronavirus data sources focused on the US in 2020. Data include areas such as cases, hospitalizations, and deaths attributed to coronavirus, as well as all-cause deaths.
The previous version of this file includes many exploratory analysis components. This version is streamlined and retains the following portions of the previous code:
The first section is for analysis of data from The COVID Tracking Project. This file contains data on positive tests, hospitalizations, deaths, and the like for coronavirus cases in the US. Downloaded data are unique by state and date.
The previous analysis made frequent use of the functional form. The functions for downloading and analyzing data from the COVID Tracking Project include:
# Function to extract and format key state data
getStateData <- function(df=usmap::statepop,
                         renameVars=c("abbr"="state", "full"="name", "pop_2015"="pop"),
                         keepVars=c("state", "name", "pop")
                         ) {
  # FUNCTION ARGUMENTS:
  # df: the data frame containing state data
  # renameVars: variables to be renamed, using a named vector with format "originalName"="newName"
  # keepVars: variables to be kept in the final file
  # Rename variables where a mapping exists; names without a mapping look up as NA
  # in renameVars and are kept unchanged
  names(df) <- ifelse(is.na(renameVars[names(df)]), names(df), renameVars[names(df)])
  # Return file with only key variables kept
  # (select(all_of(...)) replaces the superseded select_at(vars(...)) idiom and
  # errors loudly if a requested column is missing)
  df %>%
    select(all_of(keepVars))
}
# Function to read in the raw coronavirus data file (assume it is already downloaded)
readCVData <- function(fileName,
                       showGlimpse=TRUE,
                       uqVars=c("date", "state"),
                       errDups=TRUE
                       ) {
  # FUNCTION ARGUMENTS
  # fileName: location of the downloaded CSV file from COVID Tracking Project
  # showGlimpse: boolean, whether to run glimpse() on the file
  # uqVars: variables that the file is expected to be unique by
  # errDups: boolean, whether to error out if uniqueness is violated
  # Read in the file and convert the 'date' from double to date
  cvData <- readr::read_csv(fileName) %>%
    mutate(date=lubridate::ymd(date))
  # See a sample of the data
  if (showGlimpse) glimpse(cvData)
  # Check that the data are unique by uqVars
  # (anyDuplicated() returns 0 when unique, otherwise the row index of the first
  # duplicate -- NOT a duplicate count, hence the name dupIndex)
  dupIndex <- cvData %>%
    select(all_of(uqVars)) %>%
    anyDuplicated()
  # Inform of the uniqueness check results
  if (dupIndex == 0) {
    cat("\nFile is unique by:", uqVars, "and has dimensions:", dim(cvData), "\n")
  } else {
    cat("\nUniqueness check failed, file has duplicates by:", uqVars, "\n")
    if (errDups) stop("Fix and re-run")
  }
  # Return the file
  cvData
}
# Function to select relevant variables and observations, and report on control totals
processCVData <- function(dfFull,
                          varsKeep=c("date", "state", "positiveIncrease", "deathIncrease"),
                          varsRename=c("positiveIncrease"="cases", "deathIncrease"="deaths"),
                          stateList=c(state.abb, "DC")
                          ) {
  # FUNCTION ARGUMENTS
  # dfFull: the full data file originally loaded
  # varsKeep: variables to keep from the full file
  # varsRename: variables to be renamed, using a named vector of form originalName=newName
  # stateList: values for filtering state (NULL means do not run any filters)
  # Select only the key variables
  # (select(all_of(...)) replaces the superseded select_at(vars(...)) idiom)
  df <- dfFull %>%
    select(all_of(varsKeep))
  # Apply the renaming of variables; names without a mapping look up as NA and are kept
  names(df) <- ifelse(is.na(varsRename[names(df)]), names(df), varsRename[names(df)])
  # Designate each record as being either a valid state or not
  # (when no stateList is supplied, every record is treated as valid)
  if (!is.null(stateList)) {
    df <- df %>%
      mutate(validState=state %in% stateList)
  } else {
    df <- df %>%
      mutate(validState=TRUE)
  }
  # Summarize the control totals for the data, based on whether the state is valid
  # (n=1 yields a record count once summed; across(where(is.numeric), ...) replaces
  # the superseded summarize_if and likewise skips the grouping column)
  cat("\n\nControl totals - note that validState other than TRUE will be discarded\n\n")
  df %>%
    mutate(n=1) %>%
    group_by(validState) %>%
    summarize(across(where(is.numeric), sum)) %>%
    print()
  # Return the file, filtered to where validState is TRUE, and deleting variable validState
  df %>%
    filter(validState) %>%
    select(-validState)
}
# Helper function to create per capita metrics
helperPerCapita <- function(df,
                            origVar,
                            newName,
                            byVar="state",
                            popVar="pop",
                            popData=stateData,
                            mult=1000000
                            ) {
  # FUNCTION ARGUMENTS:
  # df: the data frame currently being processed
  # origVar: the variable (string) to be converted to per capita
  # newName: the new per capita variable name (string)
  # byVar: the variable that will be merged by
  # popVar: the name of the population variable in the popData file
  # popData: the file containing the population data
  # mult: the multiplier, so that the metric is "per mult people"
  # Create the per capita variable:
  # 1. attach the population column via an inner join on byVar
  # 2. compute mult * origVar / popVar -- .data[[ ]] replaces the fragile get()
  #    lookup so the string column names resolve only against the data frame
  # 3. drop the population column so only the new metric is added to df
  df %>%
    inner_join(select(popData, all_of(c(byVar, popVar))), by=byVar) %>%
    mutate(!!newName := mult * .data[[origVar]] / .data[[popVar]]) %>%
    select(-all_of(popVar))
}
# Helper function to create rolling aggregates
helperRollingAgg <- function(df,
                             origVar,
                             newName,
                             func=zoo::rollmean,
                             k=7,
                             fill=NA,
                             ...
                             ) {
  # FUNCTION ARGUMENTS:
  # df: the data frame containing the data
  # origVar: the original data column name (string)
  # newName: the new variable column name (string)
  # func: the function to be applied (zoo::rollmean will be by far the most common)
  # k: the periodicity (k=7 is rolling weekly data)
  # fill: how to fill leading/trailing data to maintain the same vector lengths
  # ...: any other arguments to be passed to func
  # Create the appropriate variable -- .data[[ ]] replaces the fragile get() lookup;
  # if df is grouped, func is applied within each group in the rows' current order
  df %>%
    mutate(!!newName := func(.data[[origVar]], k=k, fill=fill, ...))
}
# Function to add per capita and rolling to the base data frame
# Updated function to take an arbitrary number of variables and convert them
helperMakePerCapita <- function(df,
mapVars=c("cases"="cpm", "deaths"="dpm"),
popData=stateData,
k=7
) {
# FUNCTION ARGUMENTS:
# df: the initial data frame for conversion
# mapVars: named vector of variables to be converted 'original name'='converted name'
# popData: the population file passed through to helperPerCapita for the per capita step
# k: the rolling time period to use
# Create the variables for per capita
# (one new column per entry of mapVars, e.g. cases -> cpm)
for (origVar in names(mapVars)) {
df <- df %>%
helperPerCapita(origVar=origVar, newName=mapVars[origVar], popData=popData)
}
# Arrange the data by date in preparation for rolling aggregations
# NOTE: arrange() ignores groups by default, so this sorts the whole frame by date;
# because the sort is stable, rows remain date-ordered within each state, which is
# what the grouped rolling step below requires
df <- df %>%
group_by(state) %>%
arrange(date)
# Create the rolling variables
# (new column names append k to the per capita name, e.g. cpm -> cpm7 when k=7)
for (newVar in mapVars) {
df <- df %>%
helperRollingAgg(origVar=newVar, newName=paste0(newVar, k), k=k)
}
# Return the updated data frame, ungrouped
df %>%
ungroup()
}
# Function to create side-by-side plots for a deaths and cases metric
# Data in df will be aggregated to be unique by byVar using aggFunc
helperBarDeathsCases <- function(df,
numVars,
title="",
xVar="state",
fillVar=NULL,
aggFunc=sum,
mapper=varMapper
) {
# FUNCTION ARGUMENTS:
# df: the data frame containing the data
# numVars: the relevant numeric variables for plotting
# title: plot title, default is nothing
# xVar: the x-axis variable for plotting
# fillVar: the variable that will color the bars in the final plot (NULL means use "lightblue" for all)
# aggFunc: the aggregate function (will be applied to create data unique by byVar)
# mapper: mapping file to convert x/y variables to descriptive axes (named vector "variable"="label")
# OVERALL FUNCTION PROCESS:
# 1. Variables in numVar are aggregated by aggFunc to be unique by c(xVar, fillVar)
# 2. Data are pivoted longer
# 3. Bar charts are created, with coloring by fillVar if provided
# Create the byVar for summing
byVar <- xVar
if (!is.null(fillVar)) { byVar <- c(byVar, fillVar) }
# Process the data and create the plot
# (x axis orders categories by their minimum value across the pivoted metrics)
p1 <- df %>%
select_at(vars(all_of(c(byVar, numVars)))) %>%
group_by_at(vars(all_of(byVar))) %>%
summarize_all(aggFunc) %>%
pivot_longer(-all_of(byVar)) %>%
ggplot(aes(x=fct_reorder(get(xVar), value, .fun=min), y=value)) +
coord_flip() +
facet_wrap(~mapper[name], scales="free_x") +
labs(x="", y="", title=title) +
# NOTE: the if/else below parses as the right-hand operand of the trailing '+',
# so exactly one geom_col() layer is added depending on fillVar.
# NOTE(review): aes_string() is deprecated in recent ggplot2 -- consider
# aes(fill=.data[[fillVar]]) when upgrading; confirm the installed version
if (is.null(fillVar)) geom_col(fill="lightblue") else geom_col(aes_string(fill=fillVar))
# Print the plot
print(p1)
}
# Function to assess state data (no segments created yet)
assessStateData <- function(df,
titleStem="Coronavirus burden by state",
cfrEst=0.005,
mapper=varMapper
) {
# FUNCTION ARGUMENTS:
# df: the data frame containing the state-level data
# titleStem: the main title description, with (total) or (per capita) appended
# cfrEst: the estimated case fatality rate (CFR); a dashed abline will be plotted at this slope
# mapper: mapping file to convert x/y variables to descriptive axes (named vector "variable"="label")
# Plot cases and deaths by state, once for overall and once per capita
# (helperBarDeathsCases prints its plots as a side effect)
helperBarDeathsCases(df, numVars=c("deaths", "cases"), title=paste0(titleStem, " (total)"))
helperBarDeathsCases(df, numVars=c("dpm", "cpm"), title=paste0(titleStem, " (per capita)"))
# Disease burden by state, per capita: scatter of total dpm vs total cpm, labelled by state
p1 <- df %>%
group_by(state) %>%
summarize(cpm=sum(cpm), dpm=sum(dpm)) %>%
ggplot(aes(x=cpm, y=dpm)) +
geom_text(aes(label=state)) +
labs(x=mapper["cpm"],
y=mapper["dpm"],
title="Deaths vs. cases by state (per million people)",
subtitle=paste0("Dashed line is a CFR of ",
round(100*cfrEst, 1),
"% (states far from this may have case counting issues)"
)
) +
geom_abline(slope=cfrEst, lty=2)
print(p1)
# Total disease burden nationally by day, not using functional form
# (national daily sums, 7-day rolling means, then faceted line charts)
p2 <- df %>%
select(date, cases, deaths) %>%
group_by(date) %>%
summarize_if(is.numeric, sum) %>%
ungroup() %>%
helperRollingAgg(origVar="cases", newName="casesroll7") %>%
helperRollingAgg(origVar="deaths", newName="deathsroll7") %>%
select(-cases, -deaths) %>%
pivot_longer(-date) %>%
filter(!is.na(value)) %>%
ggplot(aes(x=date, y=value)) +
geom_line() +
# NOTE(review): this uses the global varMapper rather than the mapper argument --
# likely an oversight; confirm before relying on a non-default mapper here
facet_wrap(~varMapper[name], scales="free_y") +
labs(x="",
y="",
title=titleStem
)
print(p2)
}
# Function to create an elbow plot for various numbers of clusters in the data
helperElbow <- function(mtx,
                        testCenters,
                        iter.max,
                        nstart,
                        silhouette=FALSE
                        ) {
  # FUNCTION ARGUMENTS:
  # mtx: a numeric matrix, or an object that can be coerced to a numeric matrix (no character fields)
  # testCenters: integer vector for the centers to be tested
  # iter.max: parameter passed to kmeans
  # nstart: parameter passed to kmeans
  # silhouette: whether to calculate the silhouette score
  # Preallocate storage for tot.withinss and silhouetteScore (one slot per tested center)
  totWithin <- vector("numeric", length(testCenters))
  silhouetteScore <- vector("numeric", length(testCenters))
  # Create the distancing data only when needed (required for silhouette score)
  if (silhouette) distData <- dist(mtx)
  # Run k-means for every value in testCenters, and store $tot.withinss
  # (and silhouetteScore, if requested; the silhouette is undefined for k=1,
  # so that slot is left at its preallocated 0)
  for (i in seq_along(testCenters)) {
    k <- testCenters[i]
    km <- kmeans(mtx, centers=k, iter.max=iter.max, nstart=nstart)
    totWithin[i] <- km$tot.withinss
    # '&&' is the correct scalar, short-circuiting operator for an if() condition
    if (silhouette && k > 1) silhouetteScore[i] <- mean(cluster::silhouette(km$cluster, distData)[, 3])
  }
  # Create the elbow plot (labels offset slightly so they do not sit on the points)
  p1 <- tibble::tibble(n=testCenters, wss=totWithin) %>%
    ggplot(aes(x=n, y=wss)) +
    geom_point() +
    geom_line() +
    geom_text(aes(y=wss + 0.05*max(totWithin), x=n+0.2, label=round(wss, 1))) +
    labs(x="Number of segments", y="Total Within Sum-Squares", title="Elbow plot") +
    ylim(c(0, NA))
  # Create the silhouette plot if requested, shown side-by-side with the elbow plot
  if (silhouette) {
    p2 <- tibble::tibble(n=testCenters, ss=silhouetteScore) %>%
      ggplot(aes(x=n, y=ss)) +
      geom_point() +
      geom_line() +
      geom_text(aes(y=ss + 0.05*max(silhouetteScore), x=n+0.2, label=round(ss, 1))) +
      labs(x="Number of segments", y="Mean silhouette width", title="Silhouette plot") +
      ylim(c(-1, NA))
    gridExtra::grid.arrange(p1, p2, nrow=1)
  } else {
    print(p1)
  }
}
# Function to create clusters for the state data (requires all data from same year, as currently true)
clusterStates <- function(df,
caseVar="cpm",
deathVar="dpm",
shapeFunc=lubridate::month,
minShape=NULL,
minDeath=0,
minCase=0,
ratioTotalvsShape=1,
ratioDeathvsCase=1,
hierarchical=TRUE,
hierMethod="complete",
nCenters=3,
iter.max=10,
nstart=1,
testCenters=NULL,
returnList=FALSE,
seed=NULL
) {
# FUNCTION ARGUMENTS:
# df: the data frame containing cases and deaths data
# caseVar: the variable containing the cases per capita data
# deathVar: the variable containing the deaths per capita data
# shapeFunc: the function to be used for creating the shape of the curve
# minShape: the minimum value to be used for shape (to avoid very small amounts of data in Jan/Feb)
# NULL means keep everything
# minDeath: use this value as a floor for the death metric when calculating shape
# minCase: use this metric as a floor for the case metric when calculating shape
# ratioTotalvsShape: amount of standard deviation to be kept in total variable vs shape variables
# ratioDeathvsCase: amount of standard deviation to be kept in deaths vs cases
# (total death data will be scaled to have sd this many times higher than cases)
# (death percentages by time period will be scaled directly by this amount)
# hierarchical: boolean, if TRUE run hierarchical clustering, otherwise run k-means clustering
# only hierarchical clustering is currently implemented
# NOTE(review): the line above looks stale -- a k-means branch is implemented below
# hierMethod: the method for hierarchical clustering (e.g., 'complete' or 'single')
# nCenters: the number of centers to use for kmeans clustering
# testCenters: integer vector of centers to test (will create an elbow plot); NULL means do not test
# iter.max: maximumum number of kmeans iterations (default in kmeans algorithm is 10)
# nstart: number of random sets chosen for kmeans (default in kmeans algorithm is 1)
# returnList: boolean, if FALSE just the cluster object is returned
# if TRUE, a list is returned with dfCluster and the cluster object
# seed: set the seed to this value (NULL means no seed)
# Extract key information (aggregates and by shapeFunc for each state)
# (caseVar/deathVar are standardized to 'cases'/'deaths' for the rest of the function)
df <- df %>%
select_at(vars(all_of(c("date", "state", caseVar, deathVar)))) %>%
purrr::set_names(c("date", "state", "cases", "deaths")) %>%
mutate(timeBucket=shapeFunc(date)) %>%
group_by(state, timeBucket) %>%
summarize(cases=sum(cases), deaths=sum(deaths)) %>%
ungroup()
# Limit to only relevant time buckets if requested
if (!is.null(minShape)) {
df <- df %>%
filter(timeBucket >= minShape)
}
# Extract an aggregate by state, scaled so that they have the proper ratio
# (totalDeaths rescaled so sd(totalDeaths) = ratioDeathvsCase * sd(totalCases))
dfAgg <- df %>%
group_by(state) %>%
summarize(totalCases=sum(cases), totalDeaths=sum(deaths)) %>%
ungroup() %>%
mutate(totalDeaths=ratioDeathvsCase*totalDeaths*sd(totalCases)/sd(totalDeaths))
# Extract the percentages (shapes) by month, scaled so that they have the proper ratio
# (each state's metric is divided by its own total, floored at minDeath/minCase)
dfShape <- df %>%
pivot_longer(-c(state, timeBucket)) %>%
group_by(state, name) %>%
mutate(tot=pmax(sum(value), ifelse(name=="deaths", minDeath, minCase)),
value=ifelse(name=="deaths", ratioDeathvsCase, 1) * value / tot) %>%
select(-tot) %>%
# NOTE(review): pivot_wider() passes 'state' positionally as id_cols; newer tidyr
# prefers the explicit id_cols=state -- confirm the installed tidyr version
pivot_wider(state, names_from=c(name, timeBucket), values_from=value) %>%
ungroup()
# Function to calculate SD of a subset of columns
# (summarize_all(sd) yields a one-row frame; sum() then adds the per-column SDs)
calcSumSD <- function(df) {
df %>%
ungroup() %>%
select(-state) %>%
summarize_all(.funs=sd) %>%
as.vector() %>%
sum()
}
# Down-weight the aggregate data so that there is the proper sum of sd in aggregates and shapes
aggSD <- calcSumSD(dfAgg)
shapeSD <- calcSumSD(dfShape)
dfAgg <- dfAgg %>%
mutate_if(is.numeric, ~. * ratioTotalvsShape * shapeSD / aggSD)
# Combine so there is one row per state
dfCluster <- dfAgg %>%
inner_join(dfShape, by="state")
# convert 'state' to rowname (clustering functions label output by rowname)
keyData <- dfCluster %>% column_to_rownames("state")
# Create hierarchical segments or kmeans segments
if (hierarchical) {
objCluster <- hclust(dist(keyData), method=hierMethod)
plot(objCluster)
} else {
# Create an elbow plot if testCenters is not NULL
if (!is.null(testCenters)) {
helperElbow(keyData, testCenters=testCenters, iter.max=iter.max, nstart=nstart, silhouette=TRUE)
}
# Create the kmeans cluster object, setting a seed if requested
if (!is.null(seed)) set.seed(seed)
objCluster <- kmeans(keyData, centers=nCenters, iter.max=iter.max, nstart=nstart)
cat("\nCluster means and counts\n")
# NOTE(review): '=' assigns the printed matrix to a local 'n' that is never used;
# the side effect of interest here is the print()
n=objCluster$size %>% cbind(objCluster$centers) %>% round(2) %>% t() %>% print()
}
# Return the data and object is a list if returnList is TRUE, otherwise return only the clustering object
if (!isTRUE(returnList)) {
objCluster
} else {
list(objCluster=objCluster, dfCluster=dfCluster)
}
}
# Helper function to assess 30-day change vs. total
helperRecentvsTotal <- function(df,
                                xVar,
                                yVar,
                                title,
                                recencyDays=30,
                                ablineSlope=0.5,
                                mapper=varMapper,
                                labelPlot=TRUE,
                                printPlot=TRUE
                                ) {
  # FUNCTION ARGUMENTS:
  # df: the tibble containing data by state by day
  # xVar: the x-variable (string)
  # yVar: the y-variable (string)
  # title: the plot title
  # recencyDays: number of days to consider as recent
  # ablineSlope: dashed line will be drawn with this slope and intercept 0
  # mapper: mapping file to convert x/y variables to descriptive axes (named vector "variable"="label")
  # labelPlot: boolean, whether to show the labels for each point
  # printPlot: boolean, whether to display the plot (if FALSE, plot object is returned)
  # Get the recency date cutoff (the last recencyDays days of data, inclusive)
  dateCutoff <- df %>% pull(date) %>% max() - recencyDays + 1
  cat("\nRecency is defined as", format(dateCutoff, "%Y-%m-%d"), "through current\n")
  # Create the plot: zero out pre-cutoff values so the "new" metrics sum to
  # recent-only totals, then aggregate to one point per state
  p1 <- df %>%
    mutate(newCases=ifelse(date >= dateCutoff, cases, 0),
           newDeaths=ifelse(date >= dateCutoff, deaths, 0),
           newcpm=ifelse(date >= dateCutoff, cpm, 0),
           newdpm=ifelse(date >= dateCutoff, dpm, 0)
           ) %>%
    group_by(state, cluster) %>%
    # across(where(is.numeric), ...) replaces the superseded summarize_if
    summarize(across(where(is.numeric), sum)) %>%
    ungroup() %>%
    # .data[[ ]] replaces the deprecated aes_string() for string-named aesthetics
    ggplot(aes(x=.data[[xVar]], y=.data[[yVar]])) +
    labs(x=mapper[xVar],
         y=mapper[yVar],
         title=title,
         subtitle=paste0("Dashed line represents ",
                         round(100*ablineSlope),
                         "% of total is new in last ",
                         recencyDays,
                         " days"
                         )
         ) +
    geom_abline(lty=2, slope=ablineSlope) +
    lims(x=c(0, NA), y=c(0, NA)) +
    theme(legend.position = "bottom")
  # Add the appropriate geom (text labels by state, or a scatterplot if labelPlot is FALSE)
  if (labelPlot) p1 <- p1 + geom_text(aes(color=cluster, label=state))
  else p1 <- p1 + geom_point(aes(color=cluster), alpha=0.5)
  # Print the plot if requested, otherwise return the plot object
  if (isTRUE(printPlot)) {
    print(p1)
  } else {
    p1
  }
}
# Function to plot cluster vs. individual elements on a key metric
helperTotalvsElements <- function(df,
keyVar,
title,
aggAndTotal=TRUE,
pctRibbon=0.8,
aggFunc=if(aggAndTotal) median else mean,
mapper=varMapper,
facetScales="free_y",
printPlot=TRUE
) {
# FUNCTION ARGUMENTS:
# df: the data frame containing n-day rolling averages
# keyVar: the variable to be plotted
# title: the plot title
# aggAndTotal: boolean, whether to plot every individual observation along with the cluster aggregate
# pctRibbon: if aggAndTotal is FALSE, a ribbon covering this percentage of the data will be plotted
# aggFunc: how to aggregate the elements to the segment
# CAUTION that this is an aggregate of averages, rather than a population-weighted aggregate
# NOTE(review): the caption text below hard-codes "median"/"mean" to match the
# aggFunc default; a custom aggFunc will make the caption inaccurate
# mapper: the variable mapping file to get the appropriate label for keyVar
# facetScales: the scaling for the facets - "free_y" to let them all float, "fixed" to have them the same
# printPlot: boolean, if TRUE print the plot (otherwise return the plot object)
# Create an appropriate subtitle
subtitle <- if(facetScales=="free_y") {
"Caution that each facet has its own y axis with different scales"
} else if (facetScales=="fixed") {
"All facets are on the same scale"
} else {
"Update subtitle code in function helperTotalvsElements"
}
# Create an appropriate caption
caption <- if(aggAndTotal) {
"Cluster aggregate is median, weighting each observation equally\n(NOT population-weighted)"
} else {
paste0("1. Cluster aggregate is mean over all observations (NOT population-weighted)\n2. Ribbons reflect range covering ", round(pctRibbon*100), "% of observations")
}
# Create the plots for segment-level data
# (rbind duplicates every row under pseudo-state "cluster" so the grouped summarize
# collapses the copies into one aggregate series per cluster, alongside the originals)
p1 <- df %>%
rbind(mutate(., state="cluster")) %>%
group_by(state, cluster, date) %>%
summarize_at(vars(all_of(keyVar)), .funs=aggFunc) %>%
ungroup() %>%
filter(!is.na(get(keyVar))) %>%
ggplot(aes_string(x="date")) +
# data=~filter(...) is ggplot2's lambda shorthand: the layer filters the plot's
# own data down to the aggregate series only
geom_line(data=~filter(., state == "cluster"),
aes(y=get(keyVar), group=state, color=cluster),
lwd=1.5
) +
facet_wrap(~cluster, scales=facetScales) +
labs(x="",
y=mapper[keyVar],
title=title,
subtitle=subtitle,
caption=caption
) +
ylim(c(0, NA)) +
theme(legend.position="bottom")
# Add an appropriate individual metric (either every observation or quantiles)
if (aggAndTotal) {
p1 <- p1 +
geom_line(data=~filter(., state != "cluster"),
aes(y=get(keyVar), group=state),
alpha=0.25
)
} else {
# Ribbon bounds are symmetric quantiles covering pctRibbon of observations
dfRibbon <- df %>%
filter(!is.na(get(keyVar))) %>%
group_by(cluster, date) %>%
summarize(ylow=quantile(get(keyVar), 0.5-0.5*pctRibbon),
yhigh=quantile(get(keyVar), 0.5+0.5*pctRibbon)
)
p1 <- p1 +
geom_ribbon(data=dfRibbon,
aes(ymin=ylow, ymax=yhigh),
alpha=0.25
)
}
# Print plot if requested, otherwise return it
if (isTRUE(printPlot)) {
print(p1)
} else {
p1
}
}
# Function to assess clusters
# NOTE(review): defaults reference globals stateData and cvFilteredPerCapita,
# which are presumably built earlier in the workflow -- confirm before reuse
assessClusters <- function(clusters,
dfState=stateData,
dfBurden=cvFilteredPerCapita,
thruLabel="Aug 20, 2020",
isCounty=FALSE,
plotsTogether=FALSE,
clusterPlotsTogether=plotsTogether,
recentTotalTogether=plotsTogether,
clusterAggTogether=plotsTogether
) {
# FUNCTION ARGUMENTS:
# clusters: the named vector containing the clusters by state
# dfState: the file containing the states and populations
# dfBurden: the data containing the relevant per capita burden statistics by state-date
# thruLabel: label for plots for 'data through'
# isCounty: boolean, is this a plot of county-level data that have been named 'state'?
# FALSE means that it is normal state-level data
# plotsTogether: boolean, should plots be consolidated on fewer pages?
# clusterPlotsTogether: boolean, should plots p1-p4 be consolidated?
# recentTotalTogether: boolean, should recent total plots p7-p8 be consolidated?
# clusterAggTogether: boolean, should aggregate plots p9/p11 and p10/p12 be consolidated?
# Attach the clusters to the state population data
# (as.data.frame turns the vector's names into rownames, which then become 'state')
dfState <- as.data.frame(clusters) %>%
set_names("cluster") %>%
rownames_to_column("state") %>%
inner_join(dfState, by="state") %>%
mutate(cluster=factor(cluster))
# Plot the segments on a state map (only if !isCounty); counties get a bar chart instead
if (isCounty) {
p1 <- dfState %>%
count(cluster) %>%
ggplot(aes(x=fct_rev(cluster), y=n)) +
geom_col(aes(fill=cluster)) +
geom_text(aes(y=n/2, label=n)) +
coord_flip() +
labs(x="", y="# Counties", title="Membership by segment")
} else {
p1 <- usmap::plot_usmap(regions="states", data=dfState, values="cluster") +
scale_fill_discrete("cluster") +
theme(legend.position="right")
}
# Plot the population totals by segment
# Show population totals by cluster
p2 <- dfState %>%
group_by(cluster) %>%
summarize(pop=sum(pop)/1000000) %>%
ggplot(aes(x=fct_rev(cluster), y=pop)) +
geom_col(aes(fill=cluster)) +
geom_text(aes(y=pop/2, label=round(pop))) +
labs(y="2015 population (millions)", x="Cluster", title="Population by cluster (millions)") +
coord_flip()
# Build the plotting data: attach the daily per capita burden to the cluster assignments
dfPlot <- dfState %>%
inner_join(dfBurden, by="state") %>%
tibble::as_tibble()
# Plot the rolling 7-day mean daily disease burden by cluster
p3 <- dfPlot %>%
select(date, cluster, cases=cpm7, deaths=dpm7) %>%
pivot_longer(-c(date, cluster)) %>%
filter(!is.na(value)) %>%
group_by(date, cluster, name) %>%
summarize(value=median(value)) %>%
ggplot(aes(x=date, y=value)) +
geom_line(aes(group=cluster, color=cluster)) +
facet_wrap(~name, scales="free_y") +
labs(x="",
y="Rolling 7-day mean, per million",
title="Rolling 7-day mean daily disease burden, per million",
subtitle="Median per day for states assigned to cluster"
)
# Plot the total cases and total deaths by cluster
# NOTE(review): facet labels use the global varMapper, not a mapper argument
p4 <- dfPlot %>%
group_by(cluster) %>%
summarize(cases=sum(cases), deaths=sum(deaths)) %>%
pivot_longer(-cluster) %>%
ggplot(aes(x=fct_rev(cluster), y=value/1000)) +
geom_col(aes(fill=cluster)) +
geom_text(aes(y=value/2000, label=round(value/1000))) +
coord_flip() +
facet_wrap(~varMapper[name], scales="free_x") +
labs(x="Cluster", y="Burden (000s)", title="Total cases and deaths by segment")
# Place the plots together if plotsTogether is TRUE, otherwise just print
if (isTRUE(clusterPlotsTogether)) {
gridExtra::grid.arrange(p1, p2, p3, p4, nrow=2, ncol=2)
} else {
print(p1); print(p2); print(p3); print(p4)
}
# These are relevant and useful only for state-level data
# (helperBarDeathsCases prints its plots internally, so no p5/p6 objects exist)
if (!isCounty) {
# Plot total cases and total deaths by state, colored by cluster
helperBarDeathsCases(dfPlot,
numVars=c("cases", "deaths"),
title=paste0("Coronavirus impact by state through ", thruLabel),
xVar=c("state"),
fillVar=c("cluster")
)
# Plot cases per million and deaths per million by state, colored by cluster
helperBarDeathsCases(dfPlot,
numVars=c("cpm", "dpm"),
title=paste0("Coronavirus impact by state through ", thruLabel),
xVar=c("state"),
fillVar=c("cluster")
)
}
# County-level plots will be point-only; state-level plots will be labelled
# Plot last-30 vs total for cases per million by state, colored by cluster
p7 <- helperRecentvsTotal(dfPlot,
xVar="cpm",
yVar="newcpm",
title=paste0("Coronavirus burden through ", thruLabel),
labelPlot=!isCounty,
printPlot=FALSE
)
# Plot last-30 vs total for deaths per million by state, colored by cluster
p8 <- helperRecentvsTotal(dfPlot,
xVar="dpm",
yVar="newdpm",
title=paste0("Coronavirus burden through ", thruLabel),
labelPlot=!isCounty,
printPlot=FALSE
)
# Print the plots either as a single page or separately
if (isTRUE(recentTotalTogether)) {
gridExtra::grid.arrange(p7, p8, nrow=1)
} else {
print(p7); print(p8)
}
# These are currently only helpful for states (update later to make more useful for counties)
# Plot the cases per million on a free y-scale and a fixed y-scale
p9 <- helperTotalvsElements(dfPlot,
keyVar="cpm7",
aggAndTotal=!isCounty,
title="Cases per million, 7-day rolling mean",
printPlot=FALSE
)
p10 <- helperTotalvsElements(dfPlot,
keyVar="cpm7",
aggAndTotal=!isCounty,
title="Cases per million, 7-day rolling mean",
facetScales="fixed",
printPlot=FALSE
)
# Plot the deaths per million on a free y-scale and a fixed y-scale
p11 <- helperTotalvsElements(dfPlot,
keyVar="dpm7",
aggAndTotal=!isCounty,
title="Deaths per million, 7-day rolling mean",
printPlot=FALSE
)
p12 <- helperTotalvsElements(dfPlot,
keyVar="dpm7",
aggAndTotal=!isCounty,
title="Deaths per million, 7-day rolling mean",
facetScales="fixed",
printPlot=FALSE
)
# Pair free-scale with fixed-scale versions when consolidating
if (isTRUE(clusterAggTogether)) {
gridExtra::grid.arrange(p9, p11, nrow=1)
gridExtra::grid.arrange(p10, p12, nrow=1)
} else {
print(p9); print(p10); print(p11); print(p12)
}
# Return the plotting data frame
dfPlot
}
# Function to create plots of the number hospitalized by state and cluster
plotHospitalized <- function(df,
clusterVector,
dfState=stateData,
subT=""
) {
# FUNCTION ARGUMENTS:
# df: a data frame or tibble containing 'state', 'date', 'hospitalizedCurrently'
# clusterVector: a named vector of form 'state'='cluster'
# dfState: a state-level population file containing 'state' and 'pop'
# subT: a subtitle for the plot
# Create the key plotting data
# (rbind duplicates every row under pseudo-state "Total"; the grouped summarize then
# collapses the Total copies to one row per cluster-date, since each original state
# carries its own cluster value)
plotData <- df %>%
inner_join(dfState, by="state") %>%
mutate(cluster=factor(clusterVector[state])) %>%
filter(!is.na(hospitalizedCurrently)) %>%
select(date, state, cluster, hospitalizedCurrently, pop) %>%
rbind(mutate(., state="Total")) %>%
group_by(state, cluster, date) %>%
summarize(n=n(),
hospitalizedCurrently=sum(hospitalizedCurrently),
pop=sum(pop)
) %>%
mutate(hpm=1000000*hospitalizedCurrently/pop) %>%
# summarize() dropped the last grouping variable (date), so the rolling mean below
# runs per (state, cluster) series over date-ordered rows
helperRollingAgg(origVar="hpm", newName="hpm7") %>%
ungroup()
# Create the plot: thin lines for individual states, a thick colored line per cluster total
p1 <- plotData %>%
filter(!is.na(hpm7)) %>%
ggplot(aes(x=date, y=hpm7)) +
geom_line(data=~filter(., state != "Total"), aes(group=state), alpha=0.25) +
geom_line(data=~filter(., state == "Total"), aes(group=state, color=cluster), lwd=1.5) +
facet_wrap(~cluster, scales="fixed") +
ylim(c(0, NA)) +
labs(x="",
y="Currently Hospitalized 7-day rolling mean (per million)",
title="Hospitalized per million by cluster",
subtitle=subT
)
print(p1)
# Return the plot data
plotData
}
# Function to download data for COVID Tracking Project
downloadCOVIDbyState <- function(fileName,
                                 api="https://api.covidtracking.com/v1/states/daily.csv",
                                 ovrWrite=FALSE
                                 ) {
  # COVID Tracking Project API allows data downloads for personal, non-commercial use
  # https://covidtracking.com/data/api
  # FUNCTION ARGUMENTS:
  # fileName: the filename that the data will be saved to
  # api: The API link for data downloads
  # ovrWrite: whether to allow overwriting of the existing fileName
  # Guard against clobbering an existing download unless explicitly allowed
  if (file.exists(fileName)) {
    cat("\nFile already exists at:", fileName, "\n")
    if (!ovrWrite) {
      stop("Exiting due to ovrWrite=FALSE and a duplicate fileName\n")
    }
    cat("Will over-write with current data from", api, "\n")
  }
  # Download the file from the API endpoint
  download.file(api, destfile=fileName)
  # Return statistics on the downloaded file (size, timestamps, etc.)
  file.info(fileName)
}
# Function to read, convert, and sanity check a downloaded file
readCOViDbyState <- function(fileName,
checkFile=NULL,
controlFields=c("positiveIncrease", "deathIncrease", "hospitalizedCurrently"),
controlBy=c("state")
) {
# FUNCTION ARGUMENTS:
# fileName: the file name for reading the data
# checkFile: a file that can be used for comparison purposes (NULL means do not compare to anything)
# controlFields: fields that will be explicitly checked against checkFile
# controlBy: level of aggregation at which fields will be explicitly checked against checkFile
# Helper function to check for similarity of key elements (prints set differences both ways)
# NOTE(review): "Checkin" typo below is inside a runtime string -- fix deliberately,
# not as part of a documentation pass
helperSimilarity <- function(newData, refData, label) {
cat("\n\nCheckin for similarity of:", label)
cat("\nIn reference but not in current:", setdiff(refData, newData))
cat("\nIn current but not in reference:", setdiff(newData, refData))
}
# Read in the file and convert the numeric date field to date using ymd format
df <- readr::read_csv(fileName) %>%
mutate(date=lubridate::ymd(date))
# Check that the file is unique by date-state (anyDuplicated returns 0 when unique)
if ((df %>% select(date, state) %>% anyDuplicated()) != 0) {
stop("\nDuplicates by date and state, investigate and fix\n")
} else {
cat("\nFile is unique by state and date\n")
}
# Check for overall control totals in new file
cat("\n\nOverall control totals in file:\n")
df %>%
summarize_at(vars(all_of(controlFields)), .funs=sum, na.rm=TRUE) %>%
print()
# Get control totals by date for new file (long format, one row per date-field)
dfByDate <- df %>%
group_by(date) %>%
summarize_at(vars(all_of(controlFields)), .funs=sum, na.rm=TRUE) %>%
ungroup() %>%
pivot_longer(-date, values_to="newValue")
# If there is no checkFile, then just produce a plot of the key metrics
if (is.null(checkFile)) {
p1 <- dfByDate %>%
ggplot(aes(x=date, y=newValue)) +
geom_line() +
facet_wrap(~name, nrow=1, scales="free_y") +
labs(title="Control totals by date for new file (no reference file)", x="", y="Summed Value")
print(p1)
} else {
# Check for similarity of fields, dates, and states
# (deparse(substitute(...)) echoes the caller's expression for checkFile)
cat("\n*** COMPARISONS TO REFERENCE FILE:", deparse(substitute(checkFile)))
helperSimilarity(newData=names(df), refData=names(checkFile), label="column names")
helperSimilarity(newData=df %>% pull(state) %>% unique(),
refData=checkFile %>% pull(state) %>% unique(),
label="states"
)
helperSimilarity(newData=df %>% pull(date) %>% unique() %>% format("%Y-%m-%d"),
refData=checkFile %>% pull(date) %>% unique() %>% format("%Y-%m-%d"),
label="dates"
)
# Check for similarity of control totals by date in files
checkByDate <- checkFile %>%
group_by(date) %>%
summarize_at(vars(all_of(controlFields)), .funs=sum, na.rm=TRUE) %>%
ungroup() %>%
pivot_longer(-date, values_to="oldValue")
cat("\n\n*** Difference of at least 5 and difference is at least 1%:\n\n")
# inner_join() without 'by' performs a natural join on all shared columns
# (dplyr emits a message naming the join columns)
dfByDate %>%
inner_join(checkByDate) %>%
filter(abs(newValue-oldValue)>=5,
pmax(newValue, oldValue)>=1.01*pmin(newValue, oldValue)
) %>%
as.data.frame() %>%
print()
p1 <- dfByDate %>%
inner_join(checkByDate) %>%
pivot_longer(-c(date, name), names_to="newOld") %>%
ggplot(aes(x=date, y=value, group=newOld, color=newOld)) +
geom_line() +
facet_wrap(~name, nrow=1, scales="free_y") +
labs(title="Control totals by date for new and reference file", x="", y="Summed Value")
print(p1)
# Check for similarity of control totals by controlBy in files
# (semi_join restricts the new file to dates present in the reference file,
# so the two sets of totals cover the same date range)
dfByControl <- df %>%
semi_join(select(checkFile, date), by="date") %>%
group_by_at(vars(all_of(controlBy))) %>%
summarize_at(vars(all_of(controlFields)), .funs=sum, na.rm=TRUE) %>%
ungroup() %>%
pivot_longer(-all_of(controlBy), values_to="newValue")
checkByControl <- checkFile %>%
group_by_at(vars(all_of(controlBy))) %>%
summarize_at(vars(all_of(controlFields)), .funs=sum, na.rm=TRUE) %>%
ungroup() %>%
pivot_longer(-all_of(controlBy), values_to="oldValue")
cat("\n\n*** Difference of at least 5 and difference is at least 1%:\n\n")
dfByControl %>%
inner_join(checkByControl) %>%
filter(abs(newValue-oldValue)>=5,
pmax(newValue, oldValue)>=1.01*pmin(newValue, oldValue)
) %>%
as.data.frame() %>%
print()
}
# Return the processed data file
df
}
# Function to create plots of consolidated metrics
# Produces three printed plots from the state-cluster-date burden data:
#   p1: 7-day rolling per-million metrics facetted by metric x cluster,
#       with individual states drawn as faint background lines
#   p2: cluster-level lines only, facetted by metric
#   p3: each metric scaled to % of its cluster-level peak, facetted by cluster
# Returns the long-format plotting data used to build the plots.
plotConsolidatedMetrics <- function(dfMain,
                                    dfHosp=NULL,
                                    varMain=c("state", "cluster", "date", "pop", "cases", "deaths", "hosp"),
                                    varDropHosp=c("n", "pop"),
                                    joinBy=c("state", "cluster", "date"),
                                    subT=NULL,
                                    nrowPlot2=1
                                    ) {
  # FUNCTION ARGUMENTS:
  # dfMain: the main file produced by assessClusters(), containing data by state-cluster-date
  # dfHosp: the separate file with hospital data (NULL means data already available in dfMain)
  # varMain: variables to keep from dfMain
  # varDropHosp: variables to drop from dfHosp
  # joinBy: variables for joining dfMain and dfHosp
  # subT: plot subtitle (NULL will use the defaults)
  # nrowPlot2: number of rows for the facetting to use on plot 2
  if (is.null(subT)) {
    subT <- "Cases: new cases, Deaths: new deaths, Hosp: total in hospital (not new)"
  }
  # Filter dfMain to include only variables in varMain
  dfMain <- dfMain %>%
    select_at(vars(all_of(varMain)))
  # Left join dfMain to dfHosp unless dfHosp is NULL
  if (!is.null(dfHosp)) {
    dfHosp <- dfHosp %>%
      select_at(vars(all_of(names(dfHosp)[!(names(dfHosp) %in% varDropHosp)])))
    # FIX: left_join()'s by= expects a plain character vector; all_of() is a
    # tidyselect helper valid only inside selection contexts
    dfMain <- dfMain %>%
      left_join(dfHosp, by=joinBy)
  }
  # Check that variables state, cluster, date, pop are all available
  # (rewritten as a plain all() call for clarity; logic unchanged)
  if (!all(c("state", "cluster", "date", "pop") %in% names(dfMain))) {
    stop("\nFunction requires variables state, cluster, date, and pop after processing\n")
  }
  # Create the relevant plotting data: pivot metrics long, then append a copy
  # of every row with state="cluster" so the group-sum below yields cluster
  # totals alongside the per-state rows; compute per-million values and their
  # 7-day rolling mean via helperRollingAgg() (defined elsewhere in this file)
  dfPlot <- dfMain %>%
    pivot_longer(-c(state, cluster, date, pop)) %>%
    filter(!is.na(value)) %>%
    rbind(mutate(., state="cluster")) %>%
    group_by_at(vars(all_of(c(joinBy, "name")))) %>%
    summarize(value=sum(value), pop=sum(pop)) %>%
    mutate(vpm=1000000*value/pop) %>%
    arrange(state, cluster, name, date) %>%
    group_by(state, cluster, name) %>%
    helperRollingAgg(origVar="vpm", newName="vpm7")
  # Create facetted plots for totals by metric by segment
  p1 <- dfPlot %>%
    filter(!is.na(vpm7)) %>%
    ggplot(aes(x=date, y=vpm7)) +
    geom_line(data=~filter(., state=="cluster"), aes(group=cluster, color=cluster), lwd=1.5) +
    geom_line(data=~filter(., state!="cluster"), aes(group=state), alpha=0.25) +
    facet_grid(name ~ cluster, scales="free_y") +
    labs(x="",
         y="Rolling 7-day mean per million",
         title="Key metrics by cluster (7-day rolling mean per million)",
         subtitle=subT
    ) +
    scale_x_date(date_breaks="1 months", date_labels="%b") +
    theme(axis.text.x=element_text(angle=90))
  print(p1)
  # Create all-segment plot by metric
  p2 <- dfPlot %>%
    filter(!is.na(vpm7)) %>%
    ggplot(aes(x=date, y=vpm7)) +
    geom_line(data=~filter(., state=="cluster"), aes(group=cluster, color=cluster), lwd=1.5) +
    facet_wrap(~ name, scales="free_y", nrow=nrowPlot2) +
    labs(x="",
         y="Rolling 7-day mean per million",
         title="Key metrics by cluster (7-day rolling mean per million)",
         subtitle=subT
    ) +
    scale_x_date(date_breaks="1 months", date_labels="%b") +
    theme(axis.text.x=element_text(angle=90))
  print(p2)
  # Create all-metric plot by segment (define 100% as peak for segment-metric)
  p3 <- dfPlot %>%
    filter(!is.na(vpm7)) %>%
    group_by(state, cluster, name) %>%
    mutate(spm7=vpm7/max(vpm7)) %>%
    ggplot(aes(x=date, y=spm7)) +
    geom_line(data=~filter(., state=="cluster"), aes(group=name, color=name), lwd=1) +
    facet_wrap(~ cluster, scales="free_y") +
    labs(x="",
         y="% of Maximum",
         title="Key metrics by cluster (% of cluster maximum for variable)",
         subtitle=subT
    ) +
    scale_x_date(date_breaks="1 months", date_labels="%b") +
    scale_color_discrete("variable") +
    theme(axis.text.x=element_text(angle=90))
  print(p3)
  # Return the plotting data
  dfPlot
}
# Function to convert a file to cumulative totals
# Keeps only the requested metric types, then computes a running sum of the
# value column within each group (sorted by arrangeVars), storing it under
# newName. Returns the processed (ungrouped) data frame.
makeCumulative <- function(df,
                           typeVar="name",
                           typeKeep=c("cases", "deaths", "tests"),
                           valVar="vpm7",
                           groupVars=c("state", "cluster", "name"),
                           arrangeVars=c("date"),
                           newName="cum7"
                           ) {
  # FUNCTION ARGUMENTS:
  # df: data frame containing the metrics
  # typeVar: the variable holding the metric type (default is 'name')
  # typeKeep: the values of typeVar to be kept
  # valVar: the variable holding the metric value (default is 'vpm7')
  # groupVars: groups for calculating cumulative sum
  # arrangeVars: variables to be sorted by before calculating cumulative sum
  # newName: the name for the new variable
  # Create the cumulative data
  # FIX: use the .data pronoun rather than get() so the columns are always
  # resolved in the data frame (get() can silently pick up a same-named
  # variable from the calling environment if the column is missing)
  dfCum <- df %>%
    filter(.data[[typeVar]] %in% typeKeep, !is.na(.data[[valVar]])) %>%
    arrange_at(vars(all_of(c(groupVars, arrangeVars)))) %>%
    group_by_at(groupVars) %>%
    mutate(!!newName:=cumsum(.data[[valVar]])) %>%
    ungroup()
  # Return the processed data
  dfCum
}
# Function to find and flag states that are high on a key value or change in key value
# Ranks states by total cumulative burden and by recent growth for one metric,
# and returns the states that are top-nFlag on either ranking.
findFlagStates <- function(df,
                           keyMetricVal,
                           keyMetricVar="name",
                           cumVar="cum7",
                           prevDays=30,
                           nFlag=5
                           ) {
  # FUNCTION ARGUMENTS:
  # df: the data frame containing the cumulative data
  # keyMetricVal: the metric of interest (e.g., "deaths", "cases", "tests")
  # keyMetricVar: the variable name for the column containing the metric of interest
  # cumVar: variable containing the cumulative totals
  # prevDays: the number of days previous to use for defining growth
  # nFlag: the number of states to flag for either total or growth (top-n of each)
  # Find top-n in either total or recent increase
  # FIX: use the .data pronoun rather than get() so columns are always resolved
  # in the data frame rather than the calling environment
  dfFlag <- df %>%
    filter(.data[[keyMetricVar]]==keyMetricVal, state!="cluster") %>%
    select_at(vars(all_of(c("state", "date", cumVar)))) %>%
    group_by(state) %>%
    # tminus is the cumulative value exactly prevDays before the state's max date
    # NOTE(review): if no row falls exactly prevDays earlier, tminus is 0 and
    # growth equals the full total -- confirm this is the intended behavior
    summarize(maxVal=max(.data[[cumVar]]),
              tminus=sum(ifelse(date==max(date)-lubridate::days(prevDays), .data[[cumVar]], 0))
    ) %>%
    ungroup() %>%
    mutate(growth=maxVal-tminus,
           rkTotal=min_rank(-maxVal),
           rkGrowth=min_rank(-growth),
           flag=ifelse(pmin(rkTotal, rkGrowth)<=nFlag, 1, 0)
    ) %>%
    arrange(-flag, rkTotal)
  # Return the top values as a vector of states
  dfFlag %>%
    filter(flag==1) %>%
    pull(state)
}
# Function to plot cumulative data
# Optionally prints two plots: (p1) cluster-level cumulative burden over time
# for all metrics, and (p2) per-state cumulative totals for one metric with
# flagged states bolded and labelled. Note that get(keyVar) inside aes() is
# evaluated against the plot data, so keyVar must name a column of df.
plotCumulativeData <- function(df,
                               keyMetricp2,
                               flagsp2,
                               p2Desc=keyMetricp2,
                               keyVar="cum7",
                               makep1=FALSE,
                               makep2=TRUE
                               ) {
  # FUNCTION ARGUMENTS:
  # df: the data frame of cumulative data
  # keyMetricp2: the key metric to be plotted in the second plot (e.g., "deaths", "cases", "tests")
  # flagsp2: states to be treated as flagged in the second plot
  # p2Desc: the description to be used in plot 2
  # keyVar: the key variable to plot
  # makep1: boolean, whether to make the first plot
  # makep2: boolean, whether to make the second plot
  # Plot the cumulative data by cluster (if flag is set for producing this)
  if (isTRUE(makep1)) {
    p1 <- df %>%
      filter(state=="cluster") %>%
      ggplot(aes(x=date, y=get(keyVar))) +
      geom_line(aes(group=cluster, color=cluster)) +
      facet_wrap(~name, nrow=1, scales="free_y") +
      scale_x_date(date_breaks="1 months", date_labels="%m") +
      labs(x="Month",
           y="Cumulative Burden (per million)",
           title="Cumulative burden by segment (per million)"
      )
    print(p1)
  }
  # Plot the cumulative totals over time for one metric, and flag the highest
  if (isTRUE(makep2)) {
    p2 <- df %>%
      filter(state!="cluster", name==keyMetricp2) %>%
      # bold is 1 for flagged states and 0 otherwise; it drives both the
      # alpha and size aesthetics below
      mutate(bold=ifelse(state %in% flagsp2, 1, 0)) %>%
      ggplot(aes(x=date, y=get(keyVar))) +
      geom_line(aes(group=state, color=cluster, alpha=0.4+0.6*bold, size=0.5+0.5*bold)) +
      # Label each flagged state at its latest date, offset 10 days to the right
      geom_text(data=~filter(., bold==1, date==max(date)),
                aes(x=date+lubridate::days(10),
                    label=paste0(state, ": ", round(get(keyVar), 0)),
                    color=cluster
                ),
                size=3,
                fontface="bold"
      ) +
      scale_x_date(date_breaks="1 months", date_labels="%m") +
      # Identity scales so the computed alpha/size values are used as-is
      # rather than being mapped through a legend scale
      scale_alpha_identity() +
      scale_size_identity() +
      labs(x="Month",
           y=paste0("Cumulative ", p2Desc, " (per million)"),
           title=paste0("Cumulative coronavirus ", p2Desc, " by state (per million)"),
           subtitle="Top 5 states for total and growth rate are bolded and labelled"
      )
    print(p2)
  }
}
The data from COVID Tracking Project can be loaded and analyzed using the functions above, a variable mapping file, and a main function that calls the other functions and returns a list:
# STEP 0: Create a variable mapping file
# Named character vector: names are the short metric codes appearing in the
# data, values are human-readable descriptions -- presumably used to label
# plots/tables elsewhere (not referenced in this section; confirm with callers)
varMapper <- c("cases"="Cases",
               "newCases"="Increase in cases, most recent 30 days",
               "casesroll7"="Rolling 7-day mean cases",
               "deaths"="Deaths",
               "newDeaths"="Increase in deaths, most recent 30 days",
               "deathsroll7"="Rolling 7-day mean deaths",
               "cpm"="Cases per million",
               "cpm7"="Cases per day (7-day rolling mean) per million",
               "newcpm"="Increase in cases, most recent 30 days, per million",
               "dpm"="Deaths per million",
               "dpm7"="Deaths per day (7-day rolling mean) per million",
               "newdpm"="Increase in deaths, most recent 30 days, per million",
               "hpm7"="Currently Hospitalized per million (7-day rolling mean)",
               "tpm"="Tests per million",
               "tpm7"="Tests per million per day (7-day rolling mean)"
               )
# Function to download/load, process, segment, and analyze data from COVID Tracking Project
# Orchestrates the full pipeline: load state populations, (optionally) download
# and read the raw data, convert to per-capita metrics, cluster states (or use
# supplied clusters), assess and plot the clusters, and (optionally) build and
# plot cumulative metrics. Returns a list of all intermediate and final data.
readRunCOVIDTrackingProject <- function(thruLabel,
                                        downloadTo=NULL,
                                        readFrom=downloadTo,
                                        compareFile=NULL,
                                        dfPerCapita=NULL,
                                        useClusters=NULL,
                                        hierarchical=TRUE,
                                        returnList=!hierarchical,
                                        kCut=6,
                                        reAssignState=vector("list", 0),
                                        makeCumulativePlots=TRUE,
                                        ...
                                        ) {
  # FUNCTION ARGUMENTS:
  # thruLabel: the label for when the data are through (e.g., "Aug 30, 2020")
  # downloadTo: download the most recent COVID Tracking Project data to this location
  #   NULL means do not download any data
  # readFrom: location for reading in the COVID Tracking Project data (defaults to downloadTo)
  # compareFile: name of the file to use for comparisons when reading in raw data (NULL means no comparison)
  # dfPerCapita: file can be passed directly, which bypasses the loading and processing steps
  # useClusters: file containing clusters by state (NULL means make the clusters from the data)
  # hierarchical: boolean, should hierarchical clusters be produced (if FALSE, will be k-means)?
  # returnList: boolean, should a list be returned or just the cluster object?
  # kCut: number of segments when cutting the hierarchical tree
  # reAssignState: mapping file for assigning a state to another state's cluster
  #   format list("stateToChange"="stateClusterToAssign")
  # makeCumulativePlots: whether to make plots of cumulative metrics
  # ...: arguments to be passed to clusterStates(), will be used only if useClusters is NULL
  # STEP 1: Get state data
  stateData <- getStateData()
  # STEPS 2-4 are run only if dfPerCapita does not exist
  if (is.null(dfPerCapita)) {
    # STEP 2a: Download latest COVID Tracking Project data (if requested)
    if (!is.null(downloadTo)) downloadCOVIDbyState(fileName=downloadTo)
    # STEP 2b: Read-in COVID Tracking Project data
    dfRaw <- readCOViDbyState(readFrom, checkFile=compareFile)
    glimpse(dfRaw)
    # STEP 3: Process the data so that it includes all requested key variables
    varsFilter <- c("date", "state", "positiveIncrease", "deathIncrease",
                    "hospitalizedCurrently", "totalTestResultsIncrease"
    )
    dfFiltered <- processCVData(dfRaw,
                                varsKeep=varsFilter,
                                varsRename=c(positiveIncrease="cases",
                                             deathIncrease="deaths",
                                             hospitalizedCurrently="hosp",
                                             totalTestResultsIncrease="tests"
                                )
    )
    glimpse(dfFiltered)
    # STEP 4: Convert to per capita
    dfPerCapita <- helperMakePerCapita(dfFiltered,
                                       mapVars=c("cases"="cpm", "deaths"="dpm",
                                                 "hosp"="hpm", "tests"="tpm"
                                       ),
                                       popData=stateData
    )
    glimpse(dfPerCapita)
  } else {
    # Raw and filtered data are unavailable when dfPerCapita is passed directly
    dfRaw <- NULL
    dfFiltered <- NULL
  }
  # STEP 5: Create the clusters (if they have not been passed)
  if (is.null(useClusters)) {
    # Run the clustering process
    clData <- clusterStates(df=dfPerCapita, hierarchical=hierarchical, returnList=returnList, ...)
    # If hierarchical clusters, cut the tree, otherwise use the output object directly
    if (hierarchical) {
      useClusters <- cutree(clData, k=kCut)
    } else {
      useClusters <- clData$objCluster$cluster
    }
    # If requested, manually assign clusters to the cluster for another state
    # FIX: seq_along() is the idiomatic (and equivalent) form of seq_len(length(x))
    for (xNum in seq_along(reAssignState)) {
      useClusters[names(reAssignState)[xNum]] <- useClusters[reAssignState[[xNum]]]
    }
  }
  # STEP 6: Create the cluster assessments
  plotData <- assessClusters(useClusters,
                             dfState=stateData,
                             dfBurden=dfPerCapita,
                             thruLabel=thruLabel,
                             plotsTogether=TRUE
  )
  # STEP 7: Plot the consolidated metrics
  subT <- "Cases: new cases, Deaths: new deaths, Hosp: total in hospital (not new), Tests: new tests"
  consolidatedPlotData <- plotConsolidatedMetrics(plotData,
                                                  varMain=c("state", "cluster", "date", "pop",
                                                            "cases", "deaths", "hosp", "tests"
                                                  ),
                                                  subT=subT,
                                                  nrowPlot2=2
  )
  # STEP 8: Create cumulative metrics if requested
  if (makeCumulativePlots) {
    # Pivot wide to compute percent-positive (cases/tests), then back to long;
    # the first argument to pivot_wider() is id_cols, so -vpm7 keeps all
    # remaining columns as row identifiers
    consPos <- consolidatedPlotData %>%
      ungroup() %>%
      select(state, cluster, date, name, vpm7) %>%
      arrange(state, cluster, date, name) %>%
      pivot_wider(-vpm7, names_from="name", values_from="vpm7") %>%
      mutate(pctpos=cases/tests) %>%
      pivot_longer(-c(state, cluster, date), values_to="vpm7") %>%
      filter(!is.na(vpm7))
    clCum <- makeCumulative(consPos)
    # First call makes only plot 1 (all metrics by cluster); the remaining
    # calls make per-metric plots with the top states flagged
    plotCumulativeData(clCum,
                       keyMetricp2="",
                       flagsp2="",
                       makep1=TRUE,
                       makep2=FALSE
    )
    plotCumulativeData(clCum,
                       keyMetricp2="deaths",
                       flagsp2=findFlagStates(clCum, keyMetricVal = "deaths")
    )
    plotCumulativeData(clCum,
                       keyMetricp2="cases",
                       flagsp2=findFlagStates(clCum, keyMetricVal = "cases")
    )
    plotCumulativeData(clCum,
                       keyMetricp2="tests",
                       flagsp2=findFlagStates(clCum, keyMetricVal = "tests")
    )
  } else {
    clCum <- NULL
  }
  # STEP 9: Return a list of the key data
  list(stateData=stateData,
       dfRaw=dfRaw,
       dfFiltered=dfFiltered,
       dfPerCapita=dfPerCapita,
       useClusters=useClusters,
       plotData=plotData,
       consolidatedPlotData=consolidatedPlotData,
       clCum=clCum
  )
}
# Test function for hierarchical clustering with Vermont reassigned to New Hampshire
# (minShape, ratioDeathvsCase, ratioTotalvsShape, minDeath, and minCase are
# forwarded through ... to clusterStates(), per the function's documentation)
test_hier5 <- readRunCOVIDTrackingProject(thruLabel="Aug 20, 2020",
                                          readFrom="./RInputFiles/Coronavirus/CV_downloaded_200820.csv",
                                          hierarchical=TRUE,
                                          kCut=6,
                                          reAssignState=list("VT"="NH"),
                                          minShape=3,
                                          ratioDeathvsCase = 5,
                                          ratioTotalvsShape = 0.5,
                                          minDeath=100,
                                          minCase=10000
                                          )
## Parsed with column specification:
## cols(
## .default = col_double(),
## state = col_character(),
## dataQualityGrade = col_character(),
## lastUpdateEt = col_character(),
## dateModified = col_datetime(format = ""),
## checkTimeEt = col_character(),
## dateChecked = col_datetime(format = ""),
## hash = col_character(),
## grade = col_logical()
## )
## See spec(...) for full column specifications.
##
## File is unique by state and date
##
##
## Overall control totals in file:
## # A tibble: 1 x 3
## positiveIncrease deathIncrease hospitalizedCurrently
## <dbl> <dbl> <dbl>
## 1 5546056 166127 6486294
## Observations: 9,449
## Variables: 53
## $ date <date> 2020-08-20, 2020-08-20, 2020-08-20, 20...
## $ state <chr> "AK", "AL", "AR", "AS", "AZ", "CA", "CO...
## $ positive <dbl> 5332, 112449, 54765, 0, 196280, 644751,...
## $ negative <dbl> 307315, 784330, 593744, 1514, 922163, 9...
## $ pending <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ hospitalizedCurrently <dbl> 51, 1105, 499, NA, 1070, 6212, 238, 47,...
## $ hospitalizedCumulative <dbl> NA, 13330, 3790, NA, 21143, NA, 6784, 1...
## $ inIcuCurrently <dbl> NA, NA, NA, NA, 388, 1707, NA, NA, 26, ...
## $ inIcuCumulative <dbl> NA, 1348, NA, NA, NA, NA, NA, NA, NA, N...
## $ onVentilatorCurrently <dbl> 6, NA, 108, NA, 233, NA, NA, NA, 12, NA...
## $ onVentilatorCumulative <dbl> NA, 734, 488, NA, NA, NA, NA, NA, NA, N...
## $ recovered <dbl> 1513, 44684, 48458, NA, 28471, NA, 5759...
## $ dataQualityGrade <chr> "A", "B", "A", "C", "A+", "B", "A", "B"...
## $ lastUpdateEt <chr> "8/20/2020 0:00", "8/20/2020 11:00", "8...
## $ dateModified <dttm> 2020-08-20 00:00:00, 2020-08-20 11:00:...
## $ checkTimeEt <chr> "8/19/2020 20:00", "8/20/2020 7:00", "8...
## $ death <dbl> 29, 1974, 641, 0, 4684, 11686, 1800, 44...
## $ hospitalized <dbl> NA, 13330, 3790, NA, 21143, NA, 6784, 1...
## $ dateChecked <dttm> 2020-08-20 00:00:00, 2020-08-20 11:00:...
## $ totalTestsViral <dbl> 312647, 891813, 648509, NA, 1116897, 10...
## $ positiveTestsViral <dbl> 4970, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ negativeTestsViral <dbl> 307360, NA, 593744, NA, NA, NA, NA, NA,...
## $ positiveCasesViral <dbl> 5332, 107483, 54765, 0, 194734, 644751,...
## $ deathConfirmed <dbl> 29, 1905, NA, NA, 4429, NA, NA, 3572, N...
## $ deathProbable <dbl> NA, 69, NA, NA, 255, NA, NA, 886, NA, 6...
## $ totalTestEncountersViral <dbl> NA, NA, NA, NA, NA, NA, 895207, NA, NA,...
## $ totalTestsPeopleViral <dbl> NA, NA, NA, 1514, NA, NA, 645170, NA, 2...
## $ totalTestsAntibody <dbl> NA, NA, NA, NA, 255456, NA, 150931, NA,...
## $ positiveTestsAntibody <dbl> NA, NA, NA, NA, NA, NA, 10406, NA, NA, ...
## $ negativeTestsAntibody <dbl> NA, NA, NA, NA, NA, NA, 140525, NA, NA,...
## $ totalTestsPeopleAntibody <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ positiveTestsPeopleAntibody <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ negativeTestsPeopleAntibody <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ totalTestsPeopleAntigen <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ positiveTestsPeopleAntigen <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ totalTestsAntigen <dbl> NA, NA, 10358, NA, NA, NA, NA, NA, NA, ...
## $ positiveTestsAntigen <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ fips <dbl> 2, 1, 5, 60, 4, 6, 8, 9, 11, 10, 12, 13...
## $ positiveIncrease <dbl> 85, 971, 549, 0, 723, 5920, 270, 118, 5...
## $ negativeIncrease <dbl> 1713, 10462, 6680, 0, 6481, 81363, 4657...
## $ total <dbl> 312647, 896779, 648509, 1514, 1118443, ...
## $ totalTestResults <dbl> 312647, 896779, 648509, 1514, 1118443, ...
## $ totalTestResultsIncrease <dbl> 1798, 11433, 7229, 0, 7204, 87283, 7348...
## $ posNeg <dbl> 312647, 896779, 648509, 1514, 1118443, ...
## $ deathIncrease <dbl> 0, 30, 10, 0, 50, 163, 12, 1, 1, 0, 119...
## $ hospitalizedIncrease <dbl> 0, 250, 47, 0, 123, 0, 3, 72, 0, 0, 450...
## $ hash <chr> "c83a1d575a597788adccbe170950b8d197754b...
## $ commercialScore <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ negativeRegularScore <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ negativeScore <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ positiveScore <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ score <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ grade <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
##
##
## Control totals - note that validState other than TRUE will be discarded
##
## # A tibble: 2 x 6
## validState cases deaths hosp tests n
## <lgl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 FALSE 29761 385 NA 392743 790
## 2 TRUE 5516295 165742 NA 69638073 8659
## Observations: 8,659
## Variables: 6
## $ date <date> 2020-08-20, 2020-08-20, 2020-08-20, 2020-08-20, 2020-08-20,...
## $ state <chr> "AK", "AL", "AR", "AZ", "CA", "CO", "CT", "DC", "DE", "FL", ...
## $ cases <dbl> 85, 971, 549, 723, 5920, 270, 118, 55, 75, 4555, 2759, 235, ...
## $ deaths <dbl> 0, 30, 10, 50, 163, 12, 1, 1, 0, 119, 55, 3, 8, 9, 27, 11, 0...
## $ hosp <dbl> 51, 1105, 499, 1070, 6212, 238, 47, 78, 40, 5067, 2506, 187,...
## $ tests <dbl> 1798, 11433, 7229, 7204, 87283, 7348, 12420, 2326, 2087, 295...
## Observations: 8,659
## Variables: 14
## $ date <date> 2020-01-22, 2020-01-23, 2020-01-24, 2020-01-25, 2020-01-26,...
## $ state <chr> "WA", "WA", "WA", "WA", "WA", "WA", "WA", "WA", "WA", "WA", ...
## $ cases <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 2, 1, 1, 0, 3, 1, 1, 0, 0, ...
## $ deaths <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ hosp <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ tests <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 2, 1, 1, 0, 3, 1, 1, 0, 0, ...
## $ cpm <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.000...
## $ dpm <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ hpm <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ tpm <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.000...
## $ cpm7 <dbl> NA, NA, NA, 0.00000000, 0.01992331, 0.01992331, 0.01992331, ...
## $ dpm7 <dbl> NA, NA, NA, 0.00000000, 0.00000000, 0.00000000, 0.00000000, ...
## $ hpm7 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ tpm7 <dbl> NA, NA, NA, 0.00000000, 0.01992331, 0.01992331, 0.01992331, ...
##
## Recency is defined as 2020-07-22 through current
##
## Recency is defined as 2020-07-22 through current
# Test function for k-means clustering using the per capita data file previously created
# (passing dfPerCapita directly skips the download/read/process steps;
# nCenters through seed are forwarded through ... to clusterStates())
test_km5 <- readRunCOVIDTrackingProject(thruLabel="Aug 20, 2020",
                                        dfPerCapita=test_hier5$dfPerCapita,
                                        hierarchical=FALSE,
                                        makeCumulativePlots=FALSE,
                                        minShape=3,
                                        ratioDeathvsCase = 5,
                                        ratioTotalvsShape = 0.5,
                                        minDeath=100,
                                        minCase=10000,
                                        nCenters=5,
                                        testCenters=1:10,
                                        iter.max=20,
                                        nstart=10,
                                        seed=2008261400
                                        )
##
## Cluster means and counts
## 1 2 3 4 5
## . 10.00 14.00 9.00 14.00 4.00
## totalCases 0.28 0.86 0.71 0.50 0.78
## totalDeaths 0.41 1.42 2.73 0.99 5.32
## cases_3 0.03 0.01 0.05 0.02 0.10
## deaths_3 0.27 0.08 0.13 0.06 0.15
## cases_4 0.07 0.06 0.26 0.12 0.50
## deaths_4 1.27 0.63 1.73 1.00 2.58
## cases_5 0.06 0.07 0.24 0.19 0.23
## deaths_5 0.69 0.66 1.77 1.50 1.59
## cases_6 0.08 0.18 0.11 0.15 0.07
## deaths_6 0.42 0.61 0.70 1.01 0.44
## cases_7 0.23 0.44 0.21 0.28 0.06
## deaths_7 0.70 1.60 0.42 0.81 0.18
## cases_8 0.16 0.22 0.14 0.19 0.04
## deaths_8 0.70 1.38 0.25 0.61 0.06
##
## Recency is defined as 2020-07-22 through current
##
## Recency is defined as 2020-07-22 through current
# Run in a session that has access to the old data files
# identical(test_hier5$useClusters, clustVec) # TRUE
# identical(test_km5$useClusters, testCluster_km5$objCluster$cluster) # TRUE
Cluster files produced using the new functions and the existing August 20, 2020 data are identical to ‘clustVec’ and ‘testCluster_km5’ produced in the _v001 code.
Further, new data are downloaded that are current through September 30, 2020, and the test_hier5 clusters are reused for the analysis:
# Re-run on the October 1 download: compare the new raw file against the
# previously read data (compareFile) and reuse the clusters from test_hier5
# (useClusters is supplied, so no new clustering is performed)
locDownload <- "./RInputFiles/Coronavirus/CV_downloaded_201001.csv"
test_hier5_201001 <- readRunCOVIDTrackingProject(thruLabel="Sep 30, 2020",
                                                 readFrom=locDownload,
                                                 compareFile=test_hier5$dfRaw,
                                                 useClusters=test_hier5$useClusters
                                                 )
## Parsed with column specification:
## cols(
## .default = col_double(),
## state = col_character(),
## dataQualityGrade = col_character(),
## lastUpdateEt = col_character(),
## dateModified = col_datetime(format = ""),
## checkTimeEt = col_character(),
## dateChecked = col_datetime(format = ""),
## fips = col_character(),
## totalTestResultsSource = col_character(),
## hash = col_character(),
## grade = col_logical()
## )
## See spec(...) for full column specifications.
##
## File is unique by state and date
##
##
## Overall control totals in file:
## # A tibble: 1 x 3
## positiveIncrease deathIncrease hospitalizedCurrently
## <dbl> <dbl> <dbl>
## 1 7198509 198929 7830770
##
## *** COMPARISONS TO REFERENCE FILE: compareFile
##
## Checkin for similarity of: column names
## In reference but not in current:
## In current but not in reference: totalTestResultsSource
##
## Checkin for similarity of: states
## In reference but not in current:
## In current but not in reference:
##
## Checkin for similarity of: dates
## In reference but not in current:
## In current but not in reference: 2020-09-30 2020-09-29 2020-09-28 2020-09-27 2020-09-26 2020-09-25 2020-09-24 2020-09-23 2020-09-22 2020-09-21 2020-09-20 2020-09-19 2020-09-18 2020-09-17 2020-09-16 2020-09-15 2020-09-14 2020-09-13 2020-09-12 2020-09-11 2020-09-10 2020-09-09 2020-09-08 2020-09-07 2020-09-06 2020-09-05 2020-09-04 2020-09-03 2020-09-02 2020-09-01 2020-08-31 2020-08-30 2020-08-29 2020-08-28 2020-08-27 2020-08-26 2020-08-25 2020-08-24 2020-08-23 2020-08-22 2020-08-21
##
## *** Difference of at least 5 and difference is at least 1%:
## Joining, by = c("date", "name")
## date name newValue oldValue
## 1 2020-02-15 positiveIncrease 0 7
## 2 2020-02-16 positiveIncrease 0 7
## 3 2020-02-17 positiveIncrease 0 15
## 4 2020-02-18 positiveIncrease 0 9
## 5 2020-02-19 positiveIncrease 0 10
## 6 2020-02-20 positiveIncrease 0 13
## 7 2020-02-21 positiveIncrease 0 11
## 8 2020-02-22 positiveIncrease 0 13
## 9 2020-02-23 positiveIncrease 0 16
## 10 2020-02-24 positiveIncrease 0 26
## 11 2020-02-25 positiveIncrease 0 31
## 12 2020-02-26 positiveIncrease 0 29
## 13 2020-02-27 positiveIncrease 0 27
## 14 2020-02-28 positiveIncrease 0 40
## 15 2020-02-29 positiveIncrease 18 24
## 16 2020-03-01 positiveIncrease 16 78
## 17 2020-03-02 positiveIncrease 44 82
## 18 2020-03-03 positiveIncrease 48 100
## 19 2020-03-04 positiveIncrease 62 110
## 20 2020-03-05 positiveIncrease 103 151
## 21 2020-03-06 positiveIncrease 108 137
## 22 2020-03-07 positiveIncrease 175 217
## 23 2020-03-08 positiveIncrease 198 267
## 24 2020-03-09 positiveIncrease 292 367
## 25 2020-03-10 positiveIncrease 387 441
## 26 2020-03-11 positiveIncrease 509 527
## 27 2020-03-13 positiveIncrease 1072 1025
## 28 2020-03-15 positiveIncrease 1291 1251
## 29 2020-03-16 positiveIncrease 1739 1560
## 30 2020-03-17 positiveIncrease 2588 3613
## 31 2020-03-18 positiveIncrease 3089 3171
## 32 2020-03-20 positiveIncrease 6147 6255
## 33 2020-03-21 positiveIncrease 6793 6885
## 34 2020-03-22 positiveIncrease 9125 9259
## 35 2020-03-24 positiveIncrease 10769 10632
## 36 2020-04-03 hospitalizedCurrently 25729 25472
## 37 2020-04-29 positiveIncrease 26168 26641
## 38 2020-04-30 positiveIncrease 29975 29568
## 39 2020-05-08 positiveIncrease 27222 27605
## 40 2020-05-09 positiveIncrease 25218 24810
## 41 2020-05-10 positiveIncrease 21055 21504
## 42 2020-05-12 positiveIncrease 22890 22663
## 43 2020-05-26 positiveIncrease 16825 16629
## 44 2020-06-08 positiveIncrease 17209 17012
## 45 2020-06-19 positiveIncrease 31471 31046
## 46 2020-06-21 positiveIncrease 27928 27284
## 47 2020-06-23 positiveIncrease 33447 33021
## 48 2020-06-24 positiveIncrease 39075 38684
## 49 2020-06-25 positiveIncrease 39637 39072
## 50 2020-06-27 positiveIncrease 43164 43783
## 51 2020-06-29 positiveIncrease 39813 39175
## 52 2020-07-26 positiveIncrease 61000 61713
## 53 2020-07-28 positiveIncrease 59003 56229
## 54 2020-07-29 positiveIncrease 64408 66969
## 55 2020-08-02 positiveIncrease 46812 48266
## 56 2020-08-09 positiveIncrease 50627 51365
## 57 2020-08-16 positiveIncrease 42487 43083
## 58 2020-08-20 positiveIncrease 43758 43245
## 59 2020-08-20 deathIncrease 1134 1117
## Joining, by = c("date", "name")
##
##
## *** Difference of at least 5 and difference is at least 1%:
## Joining, by = c("state", "name")
## state name newValue oldValue
## 1 WA positiveIncrease 70973 68687
## 2 WI hospitalizedCurrently 45951 51401
## Observations: 11,802
## Variables: 54
## $ date <date> 2020-09-30, 2020-09-30, 2020-09-30, 20...
## $ state <chr> "AK", "AL", "AR", "AS", "AZ", "CA", "CO...
## $ positive <dbl> 8780, 154701, 83697, 0, 218507, 810625,...
## $ negative <dbl> 448427, 994475, 949107, 1571, 1243641, ...
## $ pending <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ totalTestResults <dbl> 457207, 1132039, 1029717, 1571, 1457575...
## $ hospitalizedCurrently <dbl> 53, 776, 484, NA, 560, 3267, 268, 104, ...
## $ hospitalizedCumulative <dbl> NA, 17257, 5354, NA, 22119, NA, 7558, 1...
## $ inIcuCurrently <dbl> NA, NA, 218, NA, 115, 830, NA, NA, 28, ...
## $ inIcuCumulative <dbl> NA, 1815, NA, NA, NA, NA, NA, NA, NA, N...
## $ onVentilatorCurrently <dbl> 7, NA, 93, NA, 55, NA, NA, NA, 9, NA, N...
## $ onVentilatorCumulative <dbl> NA, 1025, 676, NA, NA, NA, NA, NA, NA, ...
## $ recovered <dbl> 4555, 67948, 75312, NA, 35261, NA, 6500...
## $ dataQualityGrade <chr> "A", "A", "A+", "D", "A+", "B", "A", "B...
## $ lastUpdateEt <chr> "9/30/2020 03:59", "9/30/2020 11:00", "...
## $ dateModified <dttm> 2020-09-30 03:59:00, 2020-09-30 11:00:...
## $ checkTimeEt <chr> "09/29 23:59", "09/30 07:00", "09/29 20...
## $ death <dbl> 56, 2540, 1369, 0, 5650, 15792, 1952, 4...
## $ hospitalized <dbl> NA, 17257, 5354, NA, 22119, NA, 7558, 1...
## $ dateChecked <dttm> 2020-09-30 03:59:00, 2020-09-30 11:00:...
## $ totalTestsViral <dbl> 457207, 1132039, 1029717, 1571, NA, 147...
## $ positiveTestsViral <dbl> 7807, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ negativeTestsViral <dbl> 449109, NA, 949107, NA, NA, NA, NA, NA,...
## $ positiveCasesViral <dbl> 8780, 137564, 80610, 0, 213934, 810625,...
## $ deathConfirmed <dbl> 56, 2399, 1223, NA, 5382, NA, NA, 3610,...
## $ deathProbable <dbl> NA, 141, 146, NA, 268, NA, NA, 898, NA,...
## $ totalTestEncountersViral <dbl> NA, NA, NA, NA, NA, NA, 1341180, NA, 38...
## $ totalTestsPeopleViral <dbl> NA, NA, NA, NA, 1457575, NA, 902242, NA...
## $ totalTestsAntibody <dbl> NA, NA, NA, NA, 293098, NA, 167989, NA,...
## $ positiveTestsAntibody <dbl> NA, NA, NA, NA, NA, NA, 11733, NA, NA, ...
## $ negativeTestsAntibody <dbl> NA, NA, NA, NA, NA, NA, 156256, NA, NA,...
## $ totalTestsPeopleAntibody <dbl> NA, 58393, NA, NA, NA, NA, NA, NA, NA, ...
## $ positiveTestsPeopleAntibody <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ negativeTestsPeopleAntibody <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ totalTestsPeopleAntigen <dbl> NA, NA, 11700, NA, NA, NA, NA, NA, NA, ...
## $ positiveTestsPeopleAntigen <dbl> NA, NA, 3383, NA, NA, NA, NA, NA, NA, N...
## $ totalTestsAntigen <dbl> NA, NA, 21856, NA, NA, NA, NA, NA, NA, ...
## $ positiveTestsAntigen <dbl> NA, NA, 3300, NA, NA, NA, NA, NA, NA, N...
## $ fips <chr> "02", "01", "05", "60", "04", "06", "08...
## $ positiveIncrease <dbl> 106, 1147, 942, 0, 323, 3200, 535, 221,...
## $ negativeIncrease <dbl> 6045, 11312, 21205, 0, 3735, 88457, 674...
## $ total <dbl> 457207, 1149176, 1032804, 1571, 1462148...
## $ totalTestResultsSource <chr> "posNeg", "posNeg", "posNeg", "posNeg",...
## $ totalTestResultsIncrease <dbl> 6151, 12327, 21812, 0, 4047, 91657, 145...
## $ posNeg <dbl> 457207, 1149176, 1032804, 1571, 1462148...
## $ deathIncrease <dbl> 0, 23, 19, 0, 18, 152, 7, 3, 1, 1, 175,...
## $ hospitalizedIncrease <dbl> 0, 75, 0, 0, 72, 0, 28, 0, 0, 0, 253, 1...
## $ hash <chr> "1716aba4b705ff74814829d25487c6e57d2712...
## $ commercialScore <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ negativeRegularScore <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ negativeScore <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ positiveScore <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ score <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ grade <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
##
##
## Control totals - note that validState other than TRUE will be discarded
##
## # A tibble: 2 x 6
## validState cases deaths hosp tests n
## <lgl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 FALSE 52622 732 NA 442114 995
## 2 TRUE 7145887 198197 NA 103496519 10807
## Observations: 10,807
## Variables: 6
## $ date <date> 2020-09-30, 2020-09-30, 2020-09-30, 2020-09-30, 2020-09-30,...
## $ state <chr> "AK", "AL", "AR", "AZ", "CA", "CO", "CT", "DC", "DE", "FL", ...
## $ cases <dbl> 106, 1147, 942, 323, 3200, 535, 221, 26, 82, 1948, 1720, 87,...
## $ deaths <dbl> 0, 23, 19, 18, 152, 7, 3, 1, 1, 175, 27, 0, 16, 4, 35, 20, 4...
## $ hosp <dbl> 53, 776, 484, 560, 3267, 268, 104, 95, 72, 2097, 1896, 147, ...
## $ tests <dbl> 6151, 12327, 21812, 4047, 91657, 14538, 12401, 1464, 1074, 1...
## Observations: 10,807
## Variables: 14
## $ date <date> 2020-01-22, 2020-01-22, 2020-01-23, 2020-01-23, 2020-01-24,...
## $ state <chr> "MA", "WA", "MA", "WA", "MA", "WA", "MA", "WA", "MA", "WA", ...
## $ cases <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ deaths <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ hosp <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ tests <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...
## $ cpm <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ dpm <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ hpm <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ tpm <dbl> 0.0000000, 0.0000000, 0.1471796, 0.0000000, 0.0000000, 0.000...
## $ cpm7 <dbl> NA, NA, NA, NA, NA, NA, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ dpm7 <dbl> NA, NA, NA, NA, NA, NA, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ hpm7 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ tpm7 <dbl> NA, NA, NA, NA, NA, NA, 0.04205130, 0.00000000, 0.06307695, ...
##
## Recency is defined as 2020-09-01 through current
##
## Recency is defined as 2020-09-01 through current
Segments appear to be on trend with previous analysis, with what was previously called “late pandemic” having peaked and the segments that had smaller deaths per million showing higher percentage increases.
The second section is for analysis of data from USA Facts. This site contains county-level data for the US focused on coronavirus cases and deaths. Data are provided with one row per county and one column for each date, and are thus unique by county. There are differences in state and US totals for coronavirus cases and deaths when reported by COVID Tracking Project and USA Facts, though trends, timing, and relative magnitudes are generally in agreement between the two sources.
Functions for reading and analyzing data from USA Facts include:
# Function to plot per capita disease burden by county on a US choropleth map
plotCountyPerCapita <- function(dfDisease,
                                burdenVar,
                                useDate,
                                plotTitle="",
                                inclStates=NULL,
                                dfCounty=filter(pop_usafacts, countyFIPS!=0),
                                popVar="population",
                                highColor="darkblue",
                                maxPerCap=NULL,
                                printPlot=TRUE,
                                returnData=!printPlot
                                ) {
  # FUNCTION ARGUMENTS
  # dfDisease: file containing disease data
  # burdenVar: variable for disease burden (cumulative) on date
  # useDate: date(s) for the analysis
  # plotTitle: title for the plot (used as the fill-scale legend title)
  # inclStates: states to include (NULL means include all)
  # dfCounty: data for county-level population
  # popVar: variable for population in the dfCounty file
  # maxPerCap: the maximum amount to be used for per capita (everything at or above shaded the same)
  # highColor: the color to be used for high disease burden
  # printPlot: boolean, whether to print the plot
  # returnData: boolean, whether to return the underlying data (if FALSE, the plot object is returned)
  # Join population onto the disease data and compute burden per million people
  dfData <- dfDisease %>%
    left_join(dfCounty, by=c("countyFIPS", "state")) %>%
    filter(date %in% useDate, countyFIPS!=0, population>0) %>%
    mutate(burden=1000000*get(burdenVar)/get(popVar))
  # Modify inclStates to be every state (if needed due to NULL)
  if (is.null(inclStates)) inclStates <- dfData %>% pull(state) %>% unique() %>% sort()
  # Build the plot only when needed: printPlot is TRUE, or the plot object is the return value
  # FIX: use scalar short-circuit '||' rather than elementwise '|' in a scalar 'if' condition
  if (printPlot || !returnData) {
    p1 <- dfData %>%
      # Cap the burden at maxPerCap when a cap was supplied
      mutate(burden=if(is.null(maxPerCap)) burden else pmin(burden, maxPerCap)) %>%
      select(fips=countyFIPS, burden) %>%
      usmap::plot_usmap(regions="counties", data=., values="burden", include=inclStates)
    if (is.null(maxPerCap))
      p1 <- p1 + scale_fill_continuous(plotTitle, low="white", high=highColor)
    else {
      # With a cap, fix the fill limits and note the capping in the caption
      p1 <- p1 +
        scale_fill_continuous(plotTitle, low="white", high=highColor, limits=c(0, maxPerCap)) +
        labs(caption=paste0("Values at/above ", maxPerCap, " plotted as ", maxPerCap))
    }
  }
  # Print the plot if requested
  if (printPlot) print(p1)
  # Return the data if requested, otherwise return the plot object
  if (returnData) dfData %>% filter(state %in% inclStates) else p1
}
# Function to make the side-by-side cases and deaths per-capita county maps
casesDeathsByCounty <- function(useDate,
                                inclStates=NULL,
                                caseData=pvtCases_20200903,
                                deathData=pvtDeaths_20200903,
                                highCaseColor="darkblue",
                                highDeathColor="darkred",
                                highCaseAmount=NULL,
                                highDeathAmount=NULL,
                                mainTitle=NULL
                                ) {
  # FUNCTION ARGUMENTS:
  # useDate: the date to be plotted
  # inclStates: the states to be included (NULL means all)
  # caseData: the frame containing the case data
  # deathData: the frame containing the death data
  # highCaseColor: color for the highest level of cases in a county
  # highDeathColor: color for the highest level of deaths in a county
  # highCaseAmount: cases at/above this level will be the same color (NULL means leave as-is)
  # highDeathAmount: deaths at/above this level will be the same color (NULL means leave as-is)
  # mainTitle: main title for the grid.arrange (NULL means useDate will be used)
  # Convert useDate to Date if not already in that format
  # FIX: the previous check was written !("Date") %in% class(useDate), which only worked
  # because %in% binds tighter than '!'; inherits() states the intent directly and matches
  # the style used elsewhere in this file
  if (!inherits(useDate, "Date")) useDate <- as.Date(useDate)
  # Convert inclStates to be all states if not specified
  if (is.null(inclStates)) inclStates <- caseData %>% pull(state) %>% unique() %>% sort()
  # Create mainTitle if not passed
  if (is.null(mainTitle))
    mainTitle <- paste0("Per capita coronavirus burden as of ", format(useDate, "%B %d, %Y"))
  # Create the plot for deaths (plot object only; printed below as part of the grid)
  deathPlot <- plotCountyPerCapita(deathData,
                                   burdenVar="cumDeaths",
                                   useDate=useDate,
                                   plotTitle="Deaths\nper million",
                                   inclStates=inclStates,
                                   highColor=highDeathColor,
                                   maxPerCap=highDeathAmount,
                                   printPlot=FALSE,
                                   returnData=FALSE
                                   )
  # Create the plot for cases
  casesPlot <- plotCountyPerCapita(caseData,
                                   burdenVar="cumCases",
                                   useDate=useDate,
                                   plotTitle="Cases\nper million",
                                   inclStates=inclStates,
                                   highColor=highCaseColor,
                                   maxPerCap=highCaseAmount,
                                   printPlot=FALSE,
                                   returnData=FALSE
                                   )
  # Print the combined plot object (cases on the left, deaths on the right)
  gridExtra::grid.arrange(casesPlot,
                          deathPlot,
                          nrow=1,
                          top=grid::textGrob(mainTitle,
                                             gp=grid::gpar(fontface=2, fontsize=12),
                                             hjust=0,
                                             x=0.05
                                             )
                          )
}
# Function to plot evolution of county-level burdens, highlighting the top-N counties
countyLevelEvolution <- function(dfBurden,
                                 burdenVar,
                                 inclStates=NULL,
                                 topN=10,
                                 topNDate=NULL,
                                 printPlot=TRUE,
                                 returnData=TRUE,
                                 plotTitle=NULL,
                                 countyPopFloor=0,
                                 subT=NULL
                                 ) {
  # FUNCTION ARGUMENTS:
  # dfBurden: file containing the relevant per-capita burden data
  # burdenVar: the name of the variable containing the burden per-capita data
  # inclStates: states to be included (default NULL means include all)
  # topN: integer, number of counties to flag as "top"
  # topNDate: the date to use as the topN cutpoint (NULL means most recent)
  # printPlot: boolean, whether to print the plot
  # returnData: boolean, if TRUE return the per-capita data file, otherwise return the plot object
  # plotTitle: title for the plot (NULL means assume from burdenVar)
  # countyPopFloor: floor for county population for the county to be plotted
  # subT: subtitle for the chart (NULL means none)
  # Adjust inclStates if NULL (use every state present in the data)
  if (is.null(inclStates)) inclStates <- dfBurden %>% count(state) %>% pull(state)
  # Create plotTitle if needed, inferring cases vs. deaths from the variable name
  if (is.null(plotTitle)) {
    plotTitle <- if(stringr::str_detect(string=stringr::str_to_lower(burdenVar), pattern="deaths")) {
      "Cumulative per-capita deaths by county"
    } else {
      "Cumulative per-capita cases by county"
    }
  }
  # Get all possible dates
  allDates <- dfBurden %>% count(date) %>% pull(date)
  # Get the data for the counties in the relevant states (return data only and do not plot)
  perCapData <- plotCountyPerCapita(dfBurden,
                                    burdenVar=burdenVar,
                                    useDate=allDates,
                                    inclStates=inclStates,
                                    printPlot=FALSE,
                                    returnData=TRUE
                                    )
  # Get the relevant top-N date and convert to Date if not already of that type
  if (is.null(topNDate)) topNDate <- perCapData %>% pull(date) %>% max()
  if (!("Date" %in% class(topNDate))) topNDate <- as.Date(topNDate)
  # FIPS codes of the top-N counties by burden on topNDate, counting only those that
  # exceed the population floor
  # FIX: renamed from 'topN' to 'topFIPS' -- the old name shadowed the integer argument
  # of the same name and only worked because head(topN) was evaluated before assignment
  topFIPS <- perCapData %>%
    filter(date==topNDate, population>=countyPopFloor) %>%
    arrange(-burden) %>%
    head(topN) %>%
    pull(countyFIPS)
  # Update perCapData with easy-read county name and bolding flag for the top-N counties
  perCapData <- perCapData %>%
    mutate(county=paste0(str_replace(countyName.x, " County", ""), " (", state, ")"),
           bold=ifelse(countyFIPS %in% topFIPS, 1, 0)
           ) %>%
    arrange(date)
  # Create the plot of all counties with the topN flagged (heavier/darker lines, end labels)
  p1 <- perCapData %>%
    filter(population>=countyPopFloor) %>%
    ggplot(aes(x=date, y=burden)) +
    geom_line(aes(group=countyFIPS, color=state, alpha=0.25 + 0.75*bold, size=0.5+0.5*bold)) +
    scale_alpha_identity() +
    scale_size_identity() +
    geom_text(data=~filter(., bold==1, date==max(date)),
              aes(x=date+lubridate::days(2), label=paste0(county, ": ", round(burden)), color=state),
              size=3,
              fontface="bold",
              hjust=0
              ) +
    # Right-side expansion leaves room for the end-of-line labels
    scale_x_date(date_breaks="1 months", date_labels="%m", expand=expand_scale(mult=c(0, 0.4))) +
    labs(x="", y="Burden per million people", title=plotTitle) +
    theme(legend.position="bottom")
  # Add the subtitle if passed
  if (!is.null(subT)) p1 <- p1 + labs(subtitle=subT)
  # Print the plot if requested
  if (printPlot) print(p1)
  # Return the relevant object
  if (returnData) perCapData else p1
}
# Function for calling assessClusters() for county-level segments
helperAssessCountyClusters <- function(vecCluster,
                                       dfPop,
                                       dfBurden,
                                       dfPopGeoVar="state",
                                       dfPopPopVar="population",
                                       ...
                                       ) {
  # FUNCTION ARGUMENTS:
  # vecCluster: the named cluster vector
  # dfPop: the data frame containing the population data
  # dfBurden: the data frame containing the burden statistics by county and date
  # dfPopGeoVar: geography variable in dfPop to collapse on
  # dfPopPopVar: population variable in dfPop
  # ...: other arguments to pass to assessClusters()
  # Collapse the population frame to one row per geography (mean population)
  dfGeoPop <- dfPop %>%
    group_by_at(dfPopGeoVar) %>%
    summarize(pop=mean(get(dfPopPopVar))) %>%
    ungroup()
  # Run the county-level assessment (population column dropped from the burden data)
  # and return the resulting plot object
  assessClusters(vecCluster,
                 dfState=dfGeoPop,
                 dfBurden=select(dfBurden, -population),
                 isCounty=TRUE,
                 ...
                 )
}
# Helper function to make the overall cluster summary statistics
# Produces four plots: (p1) cluster map or membership counts, (p2) population by
# cluster, (p3) rolling 7-day mean burden by cluster, (p4) total cases/deaths by
# cluster; prints them on a single page or lets the caller's device show them
helperClusterSummaryPlots <- function(dfState,
dfPlot,
showMap,
clusterPlotsTogether,
weightedMean=TRUE,
mapLevel="states"
) {
# FUNCTION ARGUMENTS:
# dfState: contains the state/county-level data
# dfPlot: contains the joined data for plotting
# showMap: boolean for whether to create a map (if FALSE, segment membership counts are shown instead)
# clusterPlotsTogether: boolean, whether to put all four plots on the same page
# weightedMean: boolean, whether to create weighted mean by segment (if FALSE, median by segment is taken)
# mapLevel: the level to be used on the map
# Reorder the cluster levels in dfState to match dfPlot
# Convert factor order to match dfPlot (can be reordered by argument to the calling function)
dfState <- dfState %>%
mutate(cluster=factor(cluster, levels=levels(dfPlot$cluster)))
# Plot the segments on a map or show segment membership
if (showMap) {
if (mapLevel=="counties") {
# For county-level data the 'state' column holds a county FIPS code;
# pad it to 5 digits as usmap expects
dfMap <- dfState %>%
mutate(fips=stringr::str_pad(state, width=5, side="left", pad="0")) %>%
select(fips, cluster)
} else {
dfMap <- dfState
}
# Create the map
p1 <- usmap::plot_usmap(regions=mapLevel, data=dfMap, values="cluster") +
scale_fill_discrete("cluster") +
theme(legend.position="right")
} else {
# No map requested: horizontal bar chart of segment membership counts instead
p1 <- dfState %>%
count(cluster) %>%
ggplot(aes(x=fct_rev(cluster), y=n)) +
geom_col(aes(fill=cluster)) +
geom_text(aes(y=n/2, label=n)) +
coord_flip() +
labs(x="", y="# Counties", title="Membership by segment")
}
# Plot the population totals by segment
# Show population totals by cluster (in millions)
p2 <- dfState %>%
group_by(cluster) %>%
summarize(pop=sum(pop)/1000000) %>%
ggplot(aes(x=fct_rev(cluster), y=pop)) +
geom_col(aes(fill=cluster)) +
geom_text(aes(y=pop/2, label=round(pop))) +
labs(y="2015 population (millions)", x="Cluster", title="Population by cluster (millions)") +
coord_flip()
# Plot the rolling 7-day mean daily disease burden by cluster
# Summarized per date-cluster-metric as either the median of all elements in the
# cluster or the population-weighted mean, chosen at plot time via weightedMean
p3 <- dfPlot %>%
select(date, cluster, cases=cpm7, deaths=dpm7, pop) %>%
pivot_longer((-c(date, cluster, pop))) %>%
filter(!is.na(value)) %>%
group_by(date, cluster, name) %>%
summarize(mdnValue=median(value), wtdValue=sum(pop*value)/sum(pop)) %>%
ggplot(aes(x=date, y=if(weightedMean) wtdValue else mdnValue)) +
geom_line(aes(group=cluster, color=cluster)) +
facet_wrap(~name, scales="free_y") +
labs(x="",
y="Rolling 7-day mean, per million",
title="Rolling 7-day mean daily disease burden, per million",
subtitle=paste0(if(weightedMean) "Weighted mean" else "Median",
" per day for states assigned to cluster"
)
) +
scale_x_date(date_breaks="1 months", date_labels="%b")
# Plot the total cases and total deaths by cluster (shown in thousands)
# NOTE(review): varMapper is assumed to be a named vector defined elsewhere in
# the file mapping 'cases'/'deaths' to facet labels -- confirm it is in scope
p4 <- dfPlot %>%
group_by(cluster) %>%
summarize(cases=sum(cases), deaths=sum(deaths)) %>%
pivot_longer(-cluster) %>%
ggplot(aes(x=fct_rev(cluster), y=value/1000)) +
geom_col(aes(fill=cluster)) +
geom_text(aes(y=value/2000, label=round(value/1000))) +
coord_flip() +
facet_wrap(~varMapper[name], scales="free_x") +
labs(x="Cluster", y="Burden (000s)", title="Total cases and deaths by segment")
# Place the plots together if plotsTogether is TRUE, otherwise just print
if (isTRUE(clusterPlotsTogether)) {
gridExtra::grid.arrange(p1, p2, p3, p4, nrow=2, ncol=2)
} else {
print(p1); print(p2); print(p3); print(p4)
}
}
# Helper function to make total and per capita by state (calls its own helper function)
helperCallTotalPerCapita <- function(dfPlot,
                                     thruLabel
                                     ) {
  # FUNCTION ARGUMENTS:
  # dfPlot: the plotting data frame
  # thruLabel: the date that data are through
  # Both charts share the same title, x-axis (state), and fill (cluster)
  barTitle <- paste0("Coronavirus impact by state through ", thruLabel)
  # Raw totals: cases and deaths by state, colored by cluster
  helperBarDeathsCases(dfPlot,
                       numVars=c("cases", "deaths"),
                       title=barTitle,
                       xVar=c("state"),
                       fillVar=c("cluster")
                       )
  # Per-capita: cases per million and deaths per million by state, colored by cluster
  helperBarDeathsCases(dfPlot,
                       numVars=c("cpm", "dpm"),
                       title=barTitle,
                       xVar=c("state"),
                       fillVar=c("cluster")
                       )
}
# Helper function to make recent vs. total plots
helperCallRecentvsTotal <- function(dfPlot,
                                    thruLabel,
                                    labelPoints,
                                    recentTotalTogether
                                    ) {
  # FUNCTION ARGUMENTS:
  # dfPlot: the plotting data frame
  # thruLabel: the date that data are through
  # labelPoints: boolean, whether to label the individual points
  # recentTotalTogether: boolean, whether to put these plots together on one page
  # Shared title for both plots
  burdenTitle <- paste0("Coronavirus burden through ", thruLabel)
  # Last-30 vs. total for cases per million by state, colored by cluster
  plotCases <- helperRecentvsTotal(dfPlot,
                                   xVar="cpm",
                                   yVar="newcpm",
                                   title=burdenTitle,
                                   labelPlot=labelPoints,
                                   printPlot=FALSE
                                   )
  # Last-30 vs. total for deaths per million by state, colored by cluster
  plotDeaths <- helperRecentvsTotal(dfPlot,
                                    xVar="dpm",
                                    yVar="newdpm",
                                    title=burdenTitle,
                                    labelPlot=labelPoints,
                                    printPlot=FALSE
                                    )
  # Print the plots either side-by-side on a single page or separately
  if (isTRUE(recentTotalTogether)) {
    gridExtra::grid.arrange(plotCases, plotDeaths, nrow=1)
  } else {
    print(plotCases)
    print(plotDeaths)
  }
}
# Helper function to create total vs. elements plots
helperCallTotalvsElements <- function(dfPlot,
                                      aggAndTotal,
                                      clusterAggTogether,
                                      ...
                                      ) {
  # FUNCTION ARGUMENTS:
  # dfPlot: the plotting data frame
  # aggAndTotal: boolean, should each individual line be plotted (if FALSE an 80% CI is plotted instead)
  # clusterAggTogether: boolean, whether to print the plots all on a single page
  # ...: any other arguments to pass to helperTotalvsElements (especially pctRibbon or aggFunc)
  # Arguments common to all four calls (plots are built, not printed, here)
  sharedArgs <- list(dfPlot, aggAndTotal=aggAndTotal, printPlot=FALSE, ...)
  # Cases per million on a free y-scale and a fixed y-scale
  caseFree <- do.call(helperTotalvsElements,
                      c(sharedArgs,
                        list(keyVar="cpm7", title="Cases per million, 7-day rolling mean")
                        )
                      )
  caseFixed <- do.call(helperTotalvsElements,
                       c(sharedArgs,
                         list(keyVar="cpm7",
                              title="Cases per million, 7-day rolling mean",
                              facetScales="fixed"
                              )
                         )
                       )
  # Deaths per million on a free y-scale and a fixed y-scale
  deathFree <- do.call(helperTotalvsElements,
                       c(sharedArgs,
                         list(keyVar="dpm7", title="Deaths per million, 7-day rolling mean")
                         )
                       )
  deathFixed <- do.call(helperTotalvsElements,
                        c(sharedArgs,
                          list(keyVar="dpm7",
                               title="Deaths per million, 7-day rolling mean",
                               facetScales="fixed"
                               )
                          )
                        )
  # Print: paired free-scale then fixed-scale pages, or all four individually
  if (isTRUE(clusterAggTogether)) {
    gridExtra::grid.arrange(caseFree, deathFree, nrow=1)
    gridExtra::grid.arrange(caseFixed, deathFixed, nrow=1)
  } else {
    print(caseFree); print(caseFixed); print(deathFree); print(deathFixed)
  }
}
# Updated function for cluster assessment
# Main function creates the required data and calls helper functions
# 1. Call helper to create overall cluster summary
# 2. Call helper function for totals and per capita by state
# 3. Call helper for recent vs. overall cases
# 4. Call helper for evolution of segment aggregate and individual components
assessClusters <- function(clusters,
                           dfState=stateData,
                           dfBurden=cvFilteredPerCapita,
                           thruLabel="Aug 20, 2020",
                           isCounty=FALSE,
                           plotsTogether=FALSE,
                           clusterPlotsTogether=plotsTogether,
                           recentTotalTogether=plotsTogether,
                           clusterAggTogether=plotsTogether,
                           makeSummaryPlots=TRUE,
                           makeTotalvsPerCapitaPlots=!isCounty,
                           makeRecentvsTotalPlots=TRUE,
                           makeTotalvsElementPlots=TRUE,
                           showMap=!isCounty,
                           orderCluster=FALSE
                           ) {
  # FUNCTION ARGUMENTS:
  # clusters: the named vector containing the clusters by state
  # dfState: the file containing the states and populations
  # dfBurden: the data containing the relevant per capita burden statistics by state-date
  # thruLabel: label for plots for 'data through'
  # isCounty: boolean, is this a plot of county-level data that have been named 'state'?
  # FALSE means that it is normal state-level data
  # plotsTogether: boolean, should plots be consolidated on fewer pages?
  # clusterPlotsTogether: boolean, should plots p1-p4 be consolidated?
  # recentTotalTogether: boolean, should recent total plots p7-p8 be consolidated?
  # clusterAggTogether: boolean, should aggregate plots p9/p11 and p10/p12 be consolidated?
  # makeSummaryPlots: boolean, should the summary plots be made?
  # makeTotalvsPerCapitaPlots: boolean, should the total and per capita plots be produced?
  # makeRecentvsTotalPlots: boolean, should the recent vs. total plots be created?
  # makeTotalvsElementPlots: boolean, should the total vs. element plots be created?
  # showMap: boolean, whether to create a map colored by cluster (will show segment counts otherwise)
  # orderCluster: boolean, whether to order the clusters based on disease burden
  # Attach the clusters to the state population data (names of 'clusters' become 'state')
  dfState <- as.data.frame(clusters) %>%
    set_names("cluster") %>%
    rownames_to_column("state") %>%
    inner_join(dfState, by="state") %>%
    mutate(cluster=factor(cluster))
  # Join the burden history onto the cluster assignments for plotting
  dfPlot <- dfState %>%
    inner_join(dfBurden, by="state") %>%
    tibble::as_tibble()
  # Reorder the clusters by disease burden if requested
  if (orderCluster) {
    dfPlot <- changeOrderLabel(dfPlot, grpVars="state")
  }
  # Call the helper function to make the overall summary statistics
  if (makeSummaryPlots) {
    helperClusterSummaryPlots(dfState=dfState,
                              dfPlot=dfPlot,
                              showMap=showMap,
                              clusterPlotsTogether=clusterPlotsTogether,
                              mapLevel=if(isCounty) "counties" else "states"
                              )
  }
  # These are primarily useful for state-level data and default to FALSE when isCounty is TRUE
  if (makeTotalvsPerCapitaPlots) {
    helperCallTotalPerCapita(dfPlot=dfPlot, thruLabel=thruLabel)
  }
  # Call the helper function to make recent vs. total plots
  if (makeRecentvsTotalPlots) {
    helperCallRecentvsTotal(dfPlot=dfPlot,
                            thruLabel=thruLabel,
                            labelPoints=!isCounty,
                            recentTotalTogether=recentTotalTogether
                            )
  }
  # Call the total vs. elements helper function
  if (makeTotalvsElementPlots) {
    # BUG FIX: previously passed clusterPlotsTogether here, so the dedicated
    # clusterAggTogether argument (documented above) was silently ignored
    helperCallTotalvsElements(dfPlot=dfPlot,
                              aggAndTotal=!isCounty,
                              clusterAggTogether=clusterAggTogether
                              )
  }
  # Return the plotting data frame
  dfPlot
}
# Takes a list holding a clustering object and creates the desired plot
assessFunction <- function(x) {
  # Pull the cluster assignments out of the clustering object
  clusterVec <- x$objCluster$cluster
  # Summary plots only: the map and the other plot families are suppressed
  helperAssessCountyClusters(clusterVec,
                             dfPop=countyFiltered,
                             dfBurden=countyFiltered,
                             thruLabel="Sep 3, 2020",
                             plotsTogether=TRUE,
                             makeTotalvsPerCapitaPlots=FALSE,
                             makeRecentvsTotalPlots=FALSE,
                             makeTotalvsElementPlots=FALSE,
                             showMap=FALSE
                             )
}
# Helper function for pretty labeling and position of bar chart labels
# Returns a geom_text layer whose labels sit at the bar midpoint when the bar is
# tall enough (>= pctMin of the maximum) and just beyond the bar end otherwise
helperTextLabel <- function(vrbl,
groupVar=NULL,
groupValue=NULL,
divBy=1,
roundTo=0,
pctMin=0.1,
pctOut=0.025,
...
) {
# FUNCTION ARGUMENTS
# vrbl: the variable to be plotted
# groupVar: the grouping variable for deciding whether to use middle of the bar or outside
# NULL means no grouping variable
# groupValue: the value to be used for groupVar
# divBy: the amount to divide the variable by (e.g., 1000 means y-axis will be in 000s)
# roundTo: the amount to round the displayed totals to
# pctMin: any amount above this (as a share of the maximum) will be centered, below it placed outside
# pctOut: amounts on the outside will be offset by this share of the maximum
# ...: any other arguments to pass "as is" to geom_text
# Create a relevant geom_text object
# 1. label should be vrbl/divBy, rounded to roundTo and using commas as needed
# 2. y position should be centered midpoint OR left-justified end of bar
# The data argument is a formula (lambda) over the plot's own data, optionally
# filtered to rows where groupVar matches groupValue
# NOTE(review): hjust evaluates to NA for centered labels, relying on geom_text's
# default justification in that case -- confirm this is intended
geom_text(data=if(!is.null(groupVar)) ~filter(., get(groupVar) %in% groupValue) else ~.,
aes(y=ifelse(get(vrbl)>=pctMin*max(get(vrbl)),
get(vrbl)/2/divBy,
get(vrbl)/divBy + pctOut*max(get(vrbl))/divBy
),
label=scales::comma(get(vrbl)/divBy, accuracy=10**-roundTo),
hjust=ifelse(get(vrbl)>=pctMin*max(get(vrbl)), NA, 0)
),
...
)
}
# Function to run the data for a given state(s)
stateCountySummary <- function(states,
                               df=fullStateData,
                               keyDate=as.Date("2020-08-31"),
                               showQuadrants=TRUE,
                               showCumulative=FALSE,
                               facetCumulativeByState=FALSE,
                               showAllFactorLevels=FALSE,
                               returnData=FALSE
                               ) {
  # FUNCTION ARGUMENTS:
  # states: the states to include in the analysis
  # df: the data frame or tibble to use
  # keyDate: date to use for getting cluster and population by county
  # showQuadrants: boolean, whether to create and show the four-quadrants of p1-p4
  # showCumulative: boolean, whether to show the cumulative deaths per capita by segment
  # facetCumulativeByState: boolean, whether to facet by state for the cumulative plot
  # showAllFactorLevels: boolean, whether to show all factor levels in the plot legend
  # returnData: boolean, should the data frame be returned?
  # Create a base data frame with cumulative deaths per county, filtered to the states
  baseStateData <- df %>%
    arrange(fipsCounty, date) %>%
    group_by(fipsCounty) %>%
    mutate(cumDeaths=cumsum(deaths)) %>%
    filter(state %in% c(states))
  # Create a frame by state and cluster (dpm7 is population-weighted)
  stateClusterData <- baseStateData %>%
    group_by(state, cluster, date) %>%
    summarize(n=n(), dpm7=sum(dpm7*pop)/sum(pop), cumDeaths=sum(cumDeaths), pop=sum(pop)) %>%
    ungroup()
  # Create a frame by cluster
  clusterData <- baseStateData %>%
    group_by(cluster, date) %>%
    summarize(n=n(), dpm7=sum(dpm7*pop)/sum(pop), cumDeaths=sum(cumDeaths), pop=sum(pop)) %>%
    ungroup()
  # Create and show p1-p4 if requested
  if (showQuadrants) {
    # Plot 1: Number of counties in cluster
    p1 <- clusterData %>%
      filter(date==keyDate) %>%
      ggplot(aes(x=fct_rev(cluster), y=n, fill=cluster)) +
      geom_col() +
      geom_text(aes(y=n/2, label=n)) +
      coord_flip() +
      labs(x="Cluster", y="# Counties", title="Counties by Cluster")
    if (showAllFactorLevels) p1 <- p1 + scale_fill_discrete(drop=FALSE)
    # Plot 2: Total population and average population by cluster
    p2 <- clusterData %>%
      filter(date==keyDate) %>%
      mutate(popPer=pop/n) %>%
      select(cluster, pop, popPer) %>%
      pivot_longer(-cluster) %>%
      ggplot(aes(x=fct_rev(cluster), y=value/1000, fill=cluster)) +
      geom_col() +
      helperTextLabel("value", divBy=1000, groupVar="name", groupValue="popPer", pctMin=0.2, size=3.5) +
      helperTextLabel("value", divBy=1000, groupVar="name", groupValue="pop", pctMin=0.2, size=3.5) +
      coord_flip() +
      facet_wrap(~c("pop"="Population", "popPer"="Population per county")[name], scales="free_x") +
      labs(x="Cluster", y="Population (000)", title="Population by Cluster")
    if (showAllFactorLevels) p2 <- p2 + scale_fill_discrete(drop=FALSE)
    # Plot 3: Deaths and deaths per million (total) by cluster
    p3 <- clusterData %>%
      filter(date==keyDate) %>%
      mutate(dpm=1000000*cumDeaths/pop) %>%
      select(cluster, cumDeaths, dpm) %>%
      pivot_longer(-cluster) %>%
      ggplot(aes(x=fct_rev(cluster), y=value, fill=cluster)) +
      geom_col() +
      helperTextLabel("value", groupVar="name", groupValue="cumDeaths", pctMin=0.2, size=3.5) +
      helperTextLabel("value", groupVar="name", groupValue="dpm", pctMin=0.2, size=3.5) +
      coord_flip() +
      facet_wrap(~c("cumDeaths"="1. Total deaths", "dpm"="2. Deaths per million")[name],
                 scales="free_x"
                 ) +
      labs(x="Cluster", y="", title="Deaths (total and per million) by cluster")
    if (showAllFactorLevels) p3 <- p3 + scale_fill_discrete(drop=FALSE)
    # Plot 4: Evolution of deaths by time
    p4 <- clusterData %>%
      filter(!is.na(dpm7)) %>%
      ggplot(aes(x=date, y=dpm7, group=cluster, color=cluster)) +
      geom_line(size=1) +
      labs(x="", y="Deaths per million", title="Deaths per million, 7-day rolling mean") +
      scale_x_date(date_breaks="1 months", date_labels="%b")
    if (showAllFactorLevels) p4 <- p4 + scale_color_discrete(drop=FALSE)
    # Create a 2x2 summary of the cluster data
    gridExtra::grid.arrange(p1, p2, p3, p4, nrow=2, top=paste0("States: ", paste0(states, collapse=", ")))
  }
  # If requested, create a cumulative deaths per capita curve
  if (showCumulative) {
    p5 <- (if (facetCumulativeByState) stateClusterData else clusterData) %>%
      mutate(cumdpm=1000000*cumDeaths/pop) %>%
      ggplot(aes(x=date, y=cumdpm)) +
      geom_line(aes(group=cluster, color=cluster), lwd=1) +
      geom_text(data=~filter(., date==max(date, na.rm=TRUE)),
                aes(x=date+lubridate::days(3), label=scales::comma(cumdpm, accuracy=1), color=cluster),
                size=3,
                hjust=0
                ) +
      labs(x="",
           y="Cumulative deaths per million",
           title=paste0("Cumulative deaths per million for: ", paste0(states, collapse=", "))
           ) +
      scale_x_date(date_breaks="1 months", date_labels="%b", expand=expand_scale(c(0, 0.15)))
    if (facetCumulativeByState) p5 <- p5 + facet_wrap(~state)
    if (showAllFactorLevels) p5 <- p5 + scale_color_discrete(drop=FALSE)
    print(p5)
  }
  # Return the data file if requested (by state-cluster if facetCumulativeByState, by cluster otherwise)
  if (returnData) {
    # BUG FIX: the else branch previously returned stateClusterData as well, so the
    # cluster-level frame promised by the comment above was never returned
    if (facetCumulativeByState) stateClusterData else clusterData
  }
}
# Function to reorder and relabel factors
# Reorders the levels of fctVar from highest to lowest weighted disease burden,
# with the excluded dummy level (if present) appended last
changeOrderLabel <- function(df,
fctVar="cluster",
grpVars=c(),
burdenVar="dpm",
wgtVar="pop",
exclfct="999"
) {
# FUNCTION ARGUMENTS
# df: the data frame
# fctVar: the factor variable
# grpVars: the variables that the data are currently at (e.g., "fipsCounty" for county-level in df)
# burdenVar: the disease burden variable for sorting
# wgtVar: the weight variable for sorting
# exclfct: the factor level to be excluded from analysis
# General approach
# 1. Data are aggregated to c(fctVar, grpVars) as x=sum(burdenVar*wgtVar) and y=mean(wgtVar)
# 2. Data are aggregated to fctVar as z=sum(x)/sum(y)
# 3. Factors are reordered from high to low on z, with the excluded factor added back last (if it exists)
# Check if exclfct exists in the factor variable
# (drop=TRUE extracts the column as a vector so levels() works for tibbles too)
fctDummy <- exclfct %in% levels(df[, fctVar, drop=TRUE])
# Create the summary of impact by segment
# z is the weight-averaged burden for each factor level (excluding the dummy)
newLevels <- df %>%
filter(get(fctVar) != exclfct) %>%
group_by_at(vars(all_of(c(fctVar, grpVars)))) %>%
summarize(x=sum(get(burdenVar)*get(wgtVar)), y=mean(get(wgtVar))) %>%
group_by_at(vars(all_of(fctVar))) %>%
summarize(z=sum(x)/sum(y)) %>%
arrange(-z) %>%
pull(fctVar) %>%
as.character()
# Add back the dummy factor at the end (if it exists)
if (fctDummy) newLevels <- c(newLevels, exclfct)
# Reassign the levels in df (labels kept identical to the new level order)
df %>%
mutate(!!fctVar:=factor(get(fctVar), levels=newLevels, labels=newLevels))
}
# Create the cluster-state data
helperMakeClusterStateData <- function(dfPlot,
                                       dfPop=usmap::countypop,
                                       dfBurden=countyDailyPerCapita,
                                       orderCluster=FALSE
                                       ) {
  # FUNCTION ARGUMENTS:
  # dfPlot: the raw plotting data (which can have factors already reordered)
  # dfPop: source for full county data with population
  # dfBurden: source for disease burden by geography
  # orderCluster: boolean, whether to order the cluster factor order by burden
  # Latest-date cluster assignment per county, keyed by zero-padded FIPS code
  dfClusters <- dfPlot %>%
    filter(date==max(date)) %>%
    select(fipsCounty=state, cluster) %>%
    mutate(fipsCounty=stringr::str_pad(fipsCounty, width=5, side="left", pad="0"))
  # Bring in every county (including ones previously excluded from segmentation);
  # counties without an assignment are placed in dummy cluster 999
  dfAllCounties <- dfClusters %>%
    right_join(select(dfPop, fipsCounty=fips, state=abbr, countyName=county, pop=pop_2015)) %>%
    mutate(cluster=factor(ifelse(is.na(cluster), 999, as.character(cluster))))
  # Attach the disease burden history by padded FIPS code
  dfResult <- dfBurden %>%
    mutate(fipsCounty=stringr::str_pad(state, width=5, side="left", pad="0")) %>%
    select(-state, -population) %>%
    right_join(dfAllCounties)
  # Order the cluster factor by disease burden if requested
  if (orderCluster) dfResult <- changeOrderLabel(dfResult, grpVars="fipsCounty")
  # Return the relevant frame
  dfResult
}
# Helper function to read and convert a wide USA Facts file to long per-capita data
helperReadConvert <- function(file,
                              valueName
                              ) {
  # FUNCTION ARGUMENTS:
  # file: the file for reading and converting
  # valueName: name for the values column of the pivoted data
  # Read the raw wide file (one column per date) and show its structure
  dfRaw <- readr::read_csv(file)
  glimpse(dfRaw)
  # Pivot to one row per county-date, parsing the date column names
  dfLong <- dfRaw %>%
    rename(countyName=`County Name`, state=State) %>%
    pivot_longer(-c(countyFIPS, countyName, state, stateFIPS),
                 names_to="date",
                 values_to=valueName
                 ) %>%
    mutate(date=lubridate::mdy(date))
  glimpse(dfLong)
  # Attach population and per-capita burden via countyLevelEvolution (data only,
  # no plot) and return the converted file
  countyLevelEvolution(dfLong,
                       burdenVar=valueName,
                       inclStates=NULL,
                       topN=5,
                       printPlot=FALSE,
                       returnData=TRUE
                       )
}
# Function to read and convert raw data from USA Facts
readUSAFacts <- function(caseFile,
                         deathFile,
                         stateClusters=NULL
                         ) {
  # FUNCTION ARGUMENTS:
  # caseFile: the location of the downloaded cases dataset
  # deathFile: the location of the downloaded deaths dataset
  # stateClusters: a field 'cluster' will be created from this named vector (if NULL, 'cluster' will be NA)
  # Read and convert each raw file
  cnvCases <- helperReadConvert(caseFile, valueName="cumCases")
  cnvDeaths <- helperReadConvert(deathFile, valueName="cumDeaths")
  # Drop helper bookkeeping columns and rename the per-capita burden column
  deathSlim <- cnvDeaths %>%
    select(-countyName.x, -countyName.y, -bold, cumDeathPer=burden)
  caseSlim <- cnvCases %>%
    select(-countyName.x, -countyName.y, -bold, cumCasesPer=burden)
  # Join so the result is countyFIPS-county-population-date with cumCases/cumDeaths,
  # then add the state segments as 'cluster' if requested
  joinKeys <- c("countyFIPS", "stateFIPS", "county", "state", "date", "population")
  deathSlim %>%
    inner_join(caseSlim, by=joinKeys) %>%
    mutate(cluster=if(is.null(stateClusters)) NA else stateClusters[state])
}
# Function to plot the burden by cluster
# Prints two plots: (p1) county-level cases vs. deaths per million, faceted by
# cluster; (p2) cluster-level summary points (median of counties vs. segment total)
plotBurdenData <- function(df,
maxDate,
minPop
) {
# FUNCTION ARGUMENTS:
# df: the burden data
# maxDate: the maximum date
# minPop: the minimum population
# Create a date label
dateLabel <- format(as.Date(maxDate), "%b %d, %Y")
# Plot the data as of maxDate
# Each point is a county; per-cluster weighted regression line overlaid
p1 <- df %>%
filter(date==maxDate, population>=minPop) %>%
ggplot(aes(x=cumCasesPer, y=cumDeathPer)) +
geom_point(aes(size=log10(population), color=factor(cluster)), alpha=0.25) +
geom_smooth(method="lm", se=FALSE, aes(weight=population, color=factor(cluster))) +
scale_size_continuous("log10\nCounty\nPop") +
labs(x=paste0("Cases per million as of ", dateLabel),
y=paste0("Deaths per million as of ", dateLabel),
title=paste0("County-level per-capita coronavirus burden as of ", dateLabel),
subtitle=paste0("Filtered to counties with population of at least ", minPop)
) +
facet_wrap(~cluster)
print(p1)
# Further, the total burden by cluster is plotted
# (note: population>=0 here, so ALL counties enter the cluster-level totals)
# Median per-county and cluster-level mean metrics are reshaped side-by-side
p2 <- df %>%
filter(date==maxDate, population>=0) %>%
group_by(cluster) %>%
summarize(population=sum(population),
cumCases=sum(cumCases),
cumDeath=sum(cumDeaths),
mdn_CasesPer=median(cumCasesPer),
mdn_DeathPer=median(cumDeathPer)
) %>%
mutate(mean_CasesPer=1000000*cumCases/population, mean_DeathPer=1000000*cumDeath/population) %>%
select(cluster, starts_with("mdn"), starts_with("mean")) %>%
pivot_longer(-cluster) %>%
mutate(aggType=stringr::str_replace(name, "_.*", ""),
metType=stringr::str_replace(name, ".*_", "")
) %>%
pivot_wider(c(cluster, aggType), names_from="metType", values_from="value") %>%
ggplot(aes(x=CasesPer, y=DeathPer)) +
geom_point(aes(color=factor(cluster)), size=5) +
labs(x=paste0("Cases per million as of ", dateLabel),
y=paste0("Deaths per million as of ", dateLabel),
title=paste0("Cluster-level per-capita coronavirus burden as of ", dateLabel),
subtitle="State-level clusters based on hierarchical (method=`complete`)"
) +
scale_color_discrete("Cluster") +
ylim(c(0, NA)) +
xlim(c(0, NA)) +
facet_wrap(~c("mdn"="Median of all counties in segment", "mean"="Segment-level metric")[aggType])
print(p2)
}
# Create the filtered county data frame
# Takes the pivoted burden data, prints data-quality checks, converts the
# cumulative per-capita series to daily increments with 7-day rolling
# aggregates, and returns the rows passing the population/date filters.
createCountyFiltered <- function(df,
maxDate,
minPop
) {
# FUNCTION ARGUMENTS:
# df: data frame with the burden data
# maxDate: the last date to use
# minPop: the minimum population to use
# STEP 1: Keep only the identifying and per-capita columns, ordered by county/date
# (countyFIPS is carried under the name 'state' for compatibility with helpers)
base <- df %>%
select(state=countyFIPS, date, cpm=cumCasesPer, dpm=cumDeathPer, population) %>%
arrange(state, date)
# STEP 1a: Sanity checks -- no duplicate county/date rows, full date coverage
cat("\nThis should be 1, otherwise there are duplicates\n")
dupCounts <- base %>%
count(state, date) %>%
pull(n)
print(max(dupCounts))
cat("\n\n\n")
cat("\nMin and max should be the same if there are records everywhere on every day\n")
perDay <- base %>%
mutate(n=1) %>%
group_by(date) %>%
summarize(n=sum(n), population=sum(population))
perDay %>%
summarize_at(vars(all_of(c("n", "population"))), .funs=list(min=min, max=max)) %>%
print()
cat("\n\n\n")
# STEP 2: Difference the cumulative series into daily increments, then add
# 7-day rolling aggregates and implied raw counts
daily <- base %>%
group_by(state) %>%
arrange(date) %>%
mutate_at(vars(all_of(c("cpm", "dpm"))), ~ifelse(row_number()==1, ., .-lag(.))) %>%
ungroup() %>%
arrange(state, date) %>%
group_by(state) %>%
helperRollingAgg(origVar="cpm", newName="cpm7", k=7) %>%
helperRollingAgg(origVar="dpm", newName="dpm7", k=7) %>%
ungroup() %>%
mutate(cases=cpm*population/1000000, deaths=dpm*population/1000000)
# STEP 3: Apply the population and date filters
# Ensure that 'state' (which holds countyFIPS) is not summed as a double
filtered <- daily %>%
filter(population >= minPop, date <= as.Date(maxDate)) %>%
mutate(state=as.character(state))
# Report the share of counties that would fail the 100 deaths-per-million or
# 5000 cases-per-million thresholds (list names drive the output column names)
propZero <- function(x) mean(x==0)
propBelow <- function(x, n) mean(x<n)
below100 <- function(x) propBelow(x, n=100)
below5000 <- function(x) propBelow(x, n=5000)
filtered %>%
group_by(state) %>%
summarize_at(c("cpm", "dpm"), sum) %>%
ungroup() %>%
summarize_at(vars(all_of(c("cpm", "dpm"))),
.funs=list(mean_is0=propZero, mean_lt100=below100, mean_lt5000=below5000)
) %>%
print()
# Return the filtered daily data
filtered
}
# Helper function to assess the correlation between lagged cases and deaths
# NOTE(review): a second, more general helperCorrel is defined later in this
# file and shadows this one; this version is generalized to match it (the new
# aggBy argument is appended after the existing parameters, so existing
# positional and named calls are unchanged; the default reproduces the old
# hard-coded group_by(cluster) behavior).
helperCorrel <- function(lagDays, df=clusterMarchMay, x="cpm7", y="dpm7", aggBy="cluster") {
# FUNCTION ARGUMENTS:
# lagDays: number of days to lag the x series
# df: data frame (default relies on a global `clusterMarchMay` -- fragile)
# x: column name of the series to lag (predictor)
# y: column name of the response series
# aggBy: grouping variable(s) for the correlation (default "cluster")
df %>%
group_by_at(vars(all_of(aggBy))) %>%
arrange(date) %>%
mutate(xlag=lag(get(x), lagDays)) %>%
# Pairwise-complete correlation of the lagged series vs. the response
summarize(corr=cor(xlag, get(y), use="complete.obs")) %>%
mutate(lag=lagDays) %>%
ungroup()
}
# Helper function to make normalized data for cases and deaths
# Aggregates the 7-day per-capita case/death rates to the aggBy level using
# population weights, then rescales each series so 100 is the group's own peak.
helperMakeNormData <- function(df,
aggBy="cluster",
plotData=TRUE
) {
# FUNCTION ARGUMENTS:
# df: data frame or tibble
# NOTE(review): must carry columns pop, cpm7, dpm7; the burden frames
# elsewhere in this file name the population column `population` --
# confirm the caller supplies `pop`.
# aggBy: variable for aggregation for the normalized data (may be a character
# vector of several column names)
# plotData: boolean, whether to plot the data
# Create the normalized data
normData <- df %>%
group_by_at(vars(all_of(c(aggBy, "date")))) %>%
# Population-weighted average rates plus total population per group/date;
# summarize() peels off the last grouping variable (date) here
summarize(cpm7=sum(pop*cpm7)/sum(pop), dpm7=sum(pop*dpm7)/sum(pop), pop=sum(pop)) %>%
group_by_at(vars(all_of(aggBy))) %>%
# Normalize so 100 corresponds to each group's maximum for that metric
mutate(caseNorm=100*cpm7/max(cpm7, na.rm=TRUE), deathNorm=100*dpm7/max(dpm7, na.rm=TRUE)) %>%
ungroup()
# Plot the normalized data (if requested)
if (plotData) {
# Create the plotting object
p1 <- normData %>%
select(all_of(aggBy), date, caseNorm, deathNorm) %>%
pivot_longer(-c(all_of(aggBy), "date")) %>%
filter(!is.na(value)) %>%
ggplot(aes(x=date,
y=value,
# Named-vector lookup maps the pivoted column names to legend labels
color=c("deathNorm"="Normalized Death", "caseNorm"="Normalized Cases")[name],
group=name
)
) +
geom_line() +
# NOTE(review): get(aggBy) resolves only the first name when aggBy has
# length > 1 -- confirm that is the intended facet variable
facet_wrap(~get(aggBy)) +
scale_color_discrete("Metric") +
scale_x_date(date_breaks="1 months", date_labels="%m") +
labs(x="2020",
y="Normalized Burden",
title="Burden by Segment",
subtitle="Normalized (100 is segment maximum for metric)"
)
print(p1)
}
# Return the normalized data
normData
}
# Helper function to assess the correlations by lag
# Lags the x series by lagDays within each aggBy group, correlates it against
# the y series (complete observations only), and tags each row with the lag.
helperCorrel <- function(lagDays, df, aggBy="cluster", x="cpm7", y="dpm7") {
# Build the lagged predictor within each group, in date order
lagged <- df %>%
group_by_at(vars(all_of(aggBy))) %>%
arrange(date) %>%
mutate(xlag=lag(get(x), lagDays))
# One correlation per group, annotated with the lag that produced it
lagged %>%
summarize(corr=cor(xlag, get(y), use="complete.obs")) %>%
mutate(lag=lagDays) %>%
ungroup()
}
# Function to test lags and produce subsetted frame
# Scans case-to-death lags of 0-30 days per aggBy group, prints the burden,
# correlation, and death/case-ratio plots, and returns the data joined to each
# group's best lag.
helperTestLags <- function(df,
minDate=NULL,
maxDate=NULL,
aggBy="cluster",
maxRatio=1,
minRatio=-0.05,
plotData=TRUE
) {
# FUNCTION ARGUMENTS:
# df: data frame or tibble (needs date, cpm7, dpm7 plus the aggBy columns)
# minDate: the minimum date to use for analysis (NULL means use data minimum)
# maxDate: the maximum date to use for analysis (NULL means use data maximum)
# aggBy: the aggregation level for the input data in df
# maxRatio: the maximum ratio of deaths to cases (flat-line at this ratio if above)
# minRatio: the minimum ratio of deaths to cases (flat-line at this ratio if below)
# plotData: boolean, whether to plot the subset of the data
# Get the minimum and/or maximum date from the data if passed as NULL
if (is.null(minDate)) minDate <- df %>% pull(date) %>% min(na.rm=TRUE) else minDate <- as.Date(minDate)
if (is.null(maxDate)) maxDate <- df %>% pull(date) %>% max(na.rm=TRUE) else maxDate <- as.Date(maxDate)
# Filter the data to only the desired columns and time periods
filterData <- df %>%
select(all_of(aggBy), date, cpm7, dpm7) %>%
filter(!is.na(cpm7), !is.na(dpm7), date>=minDate, date<=maxDate)
# Create the plot data if requested
if (plotData) {
p1 <- filterData %>%
pivot_longer(-c(all_of(aggBy), date)) %>%
group_by_at(vars(all_of(c(aggBy, "name")))) %>%
# Normalize each metric to 100 at its own within-window peak per group
mutate(normValue=100*value/max(value)) %>%
filter(!is.na(normValue)) %>%
ggplot(aes(x=date,
y=normValue,
# Named-vector lookup maps column names to legend labels
color=c("dpm7"="Normalized Death", "cpm7"="Normalized Cases")[name],
group=name
)
) +
geom_line() +
facet_wrap(~get(aggBy)) +
scale_color_discrete("Metric") +
scale_x_date(date_breaks="1 months", date_labels="%m") +
labs(x="2020",
y="Normalized Burden",
title=paste0("Burden by Segment (",
format(minDate, "%b %d"),
" - ",
format(maxDate, "%b %d"),
")"
),
subtitle="Normalized (100 is segment maximum for metric during time period)"
)
print(p1)
}
# Test the various lags for correlations
# (relies on the general helperCorrel definition that accepts aggBy)
corrLagEarly <- map_dfr(.x=0:30, .f=helperCorrel, df=filterData, aggBy=aggBy)
# Plot of correlations
p2 <- corrLagEarly %>%
ggplot(aes(x=lag, y=corr)) +
geom_line(aes(group=get(aggBy), color=get(aggBy))) +
facet_wrap(~get(aggBy)) +
labs(x="Lag (days) for cases",
y="Correlation to deaths",
title="Correlation of lagged cases and deaths"
)
print(p2)
# Find the best lags for each aggBy (ties retain multiple rows per group)
bestLag <- corrLagEarly %>%
group_by_at(vars(all_of(aggBy))) %>%
filter(corr==max(corr)) %>%
ungroup()
cat("\nThe best lags are:\n")
print(bestLag)
# Create the lagged database
lagData <- filterData %>%
inner_join(bestLag, by=aggBy) %>%
group_by_at(vars(all_of(aggBy))) %>%
arrange(date) %>%
# min(lag) breaks ties from bestLag by using the smallest best lag per group
mutate(cpmlag=lag(cpm7, min(lag)))
# Create the plot of the lagged database
p3 <- lagData %>%
filter(!is.na(cpmlag)) %>%
# Clamp both the per-period and cumulative ratios to [minRatio, maxRatio]
mutate(ratCur=pmax(minRatio, pmin(maxRatio, dpm7/cpmlag)),
ratCum=pmax(minRatio, pmin(maxRatio, cumsum(dpm7)/cumsum(cpmlag)))
) %>%
ggplot(aes(x=date, group=get(aggBy), color=get(aggBy))) +
geom_line(aes(y=ratCum)) +
geom_line(aes(y=ratCur), lty=2) +
labs(x="",
y="Cumulative Death vs. Case Ratio",
title="Deaths vs. Cases with Lag (Subsetted Time Period)",
subtitle="Solid line is cumulative, dashed line is current time period",
caption=paste0("Ratio of death to cases capped between ",
round(maxRatio, 3),
" and ",
round(minRatio, 3),
" for plotting")
) +
# Label each series at its last date, nudged 3 days right of the line end
geom_text(data=~filter(., date==max(date)),
aes(label=paste0(round(100*ratCum, 1), "%"),
y=ratCum,
x=date+lubridate::days(3)
),
hjust=0
) +
# expand_scale() is deprecated in newer ggplot2 (use expansion()); kept for
# the ggplot2 3.2.1 used by this file
scale_x_date(date_breaks="1 months", date_labels="%m", expand=expand_scale(mult=c(0, 0.2))) +
facet_wrap(~get(aggBy))
print(p3)
# Return the lagged data
lagData
}
# Explore the top-n counties by population in a subset of states
# Selects the top-nKey counties (by nVar) meeting the population floor, then
# runs the normalization and lag-testing helpers on just those counties.
exploreTopCounties <- function(df,
minDate,
maxDate,
minPop=100000,
states=state.abb,
nVar="pop",
nKey=9
) {
# FUNCTION ARGUMENTS:
# df: the data frame or tibble containing county (as variable 'state'), cluster, pop, date, dpm, cpm
# minDate: minimum date for lags
# maxDate: maximum date for lags
# minPop: minimum population for including as a top county (default 100,000)
# states: the states for filtering (default: all US state abbreviations)
# nVar: the variable to use for top-n (should be "pop" or "dpm")
# nKey: the number of counties to pull for top-N
minDate <- as.Date(minDate)
maxDate <- as.Date(maxDate)
# Extract a list of the heaviest-hit counties with population of at least minPop
ctyFilter <- df %>%
# Zero-pad to a 5-digit FIPS code so the join key matches usmap::countypop
mutate(state=str_pad(state, width=5, side="left", pad="0")) %>%
inner_join(select(usmap::countypop, -pop_2015), by=c("state"="fips")) %>%
# Build a label like "<cluster> - <Name> (<ST>)": str_replace swaps the first
# occurrence of "County" or "Parish" for the opening parenthesis
mutate(countyName=paste0(cluster, " - ",
stringr::str_replace(county, "County|Parish", "("),
abbr,
")"
)
) %>%
filter(pop >= minPop, abbr %in% states) %>%
group_by(state, cluster, countyName) %>%
summarize(dpm=sum(dpm), pop=mean(pop)) %>%
ungroup() %>%
top_n(n=nKey, wt=get(nVar)) %>%
arrange(desc(get(nVar)))
# Display the list of key counties
cat("\n*** KEY COUNTIES INCLUDE: ***\n")
print(ctyFilter)
# Keep only the key counties and normalize the data
cNorm_key <- df %>%
mutate(state=str_pad(state, width=5, side="left", pad="0")) %>%
inner_join(select(ctyFilter, state, countyName), by=c("state"="state")) %>%
helperMakeNormData(aggBy=c("countyName", "state", "cluster"))
# Create lags for given time period (prints its own diagnostic plots)
lag_key <- helperTestLags(cNorm_key,
minDate=minDate,
maxDate=maxDate,
aggBy=c("countyName", "state", "cluster"),
maxRatio=0.25
)
# Return the lagged data
lag_key
}
# Function to read and pivot input data from USA Facts
# Reads the population, cases, and deaths CSVs, pivots the wide-by-date case
# and death files to one row per county/date, summarizes the "unassigned"
# (countyFIPS==0) share by state, and prints a diagnostic plot.
# RETURNS: list(pop, cases, deaths, unassigned)
readPivotUSAFacts <- function(popFile,
caseFile,
deathFile,
unassignedDate
) {
# FUNCTION ARGUMENTS:
# popFile: population file (CSV with countyFIPS, `County Name`, State, population)
# caseFile: case file (wide CSV, one column per date)
# deathFile: deaths file (wide CSV, one column per date)
# unassignedDate: date to be used for extracting unassigned summaries (generally, last complete date)
# Read population file
pop <- readr::read_csv(popFile) %>%
rename(countyName=`County Name`, state=State)
# Helper for reading a wide-by-date CSV and pivoting to one row per county/date
helperReadPivot <- function(x, valueName) {
readr::read_csv(x) %>%
rename(countyName=`County Name`, state=State) %>%
pivot_longer(-c(countyFIPS, countyName, state, stateFIPS),
names_to="date",
values_to=valueName
) %>%
# Date columns arrive as m/d/yy strings
mutate(date=lubridate::mdy(date))
}
# Read cases file
cases <- helperReadPivot(caseFile, valueName="cumCases")
# Read deaths file
deaths <- helperReadPivot(deathFile, valueName="cumDeaths")
# Assess the extent of the unassigned data issue
# (countyFIPS==0 marks the "Statewide Unallocated" rows in the USA Facts files)
unassigned <- cases %>%
mutate(unassigned=ifelse(countyFIPS==0, 1, 0)) %>%
group_by(state, date, unassigned) %>%
summarize(cases=sum(cumCases)) %>%
ungroup() %>%
full_join(deaths %>%
mutate(unassigned=ifelse(countyFIPS==0, 1, 0)) %>%
group_by(state, date, unassigned) %>%
summarize(deaths=sum(cumDeaths)) %>%
ungroup(),
# FIX: make the join keys explicit rather than relying on dplyr's
# implicit natural join (which is fragile if columns change and emits
# a "Joining, by = ..." message)
by=c("state", "date", "unassigned")
)
# Create plot for the unassigned data: share of cases/deaths per state that
# are unassigned as of unassignedDate, ordered by the unassigned case share
p1 <- unassigned %>%
filter(date==unassignedDate) %>%
pivot_longer(c("cases", "deaths")) %>%
group_by(state, name) %>%
summarize(total=sum(value), unass=sum(value*unassigned)/sum(value)) %>%
ggplot(aes(x=fct_reorder(state, unass*ifelse(name=="cases", 1, 0)), y=unass)) +
geom_point() +
# Label only the notable offenders, nudged just below the point
geom_text(data=~filter(., (name=="cases" & unass >= 0.02) | (name=="deaths" & unass > 0.005)),
aes(y=unass + ifelse(name=="cases", -0.0025, -0.001),
label=paste0(round(100*unass, 1), "% (", state ,")")
),
size=3,
hjust=1
) +
coord_flip() +
labs(x="",
y=paste0("% Unassigned as of ", as.character(unassignedDate)),
title="Unassigned percentage by state"
) +
facet_wrap(~name, scales="free_x")
print(p1)
# Return list containing the three key files
list(pop=pop, cases=cases, deaths=deaths, unassigned=unassigned)
}
# STEP 4: Create county-level clusters
# Prepares county-level daily per-capita data from the burden file, prints
# data-quality checks, filters by population/date, and runs clusterStates.
# RETURNS: list(objCluster, countyFiltered, countyDailyPerCapita, countyCumPerCapita)
prepClusterCounties <- function(burdenFile,
maxDate,
minPop,
hierarchical=FALSE,
returnList=!hierarchical,
...
) {
# FUNCTION ARGUMENTS:
# burdenFile: the pivoted file containing the burdens data
# maxDate: the latest date to be used from the data
# minPop: the smallest population for a county to be included
# hierarchical: whether to create hierarchical clusters (only set up for FALSE as of now)
# returnList: whether the clustering call returns a list (only set up for TRUE as of now)
# ...: other arguments to be passed to clusterStates
# STEP 1: Select only desired variables from burden file
# (countyFIPS is carried under the name 'state' for compatibility with helpers)
countyCumPerCapita <- burdenFile %>%
select(state=countyFIPS, date, cpm=cumCasesPer, dpm=cumDeathPer, population) %>%
arrange(state, date)
# STEP 2: Confirm that there are no duplicates and that every county has the same dates
# BUG FIX: these check results were previously computed but silently discarded
# inside the function body (unassigned expressions are not auto-printed inside
# a function); print them so the checks are actually visible, matching the
# style used in createCountyFiltered.
# This should be 1 provided that there are no duplicates
cat("\nThis should be 1, otherwise there are duplicates\n")
countyCumPerCapita %>%
count(state, date) %>%
pull(n) %>%
max() %>%
print()
# This should have no standard deviation if the same number of records exist on every day
cat("\nSD should be 0 if the same number of records exist on every day\n")
countyCumPerCapita %>%
mutate(n=1) %>%
group_by(date) %>%
summarize(n=sum(n), population=sum(population)) %>%
summarize_at(vars(all_of(c("n", "population"))), .funs=list(sd=sd, min=min, max=max)) %>%
print()
# STEP 3: Convert to daily new totals rather than cumulative data
countyDailyPerCapita <- countyCumPerCapita %>%
group_by(state) %>%
arrange(date) %>%
mutate_at(vars(all_of(c("cpm", "dpm"))), ~ifelse(row_number()==1, ., .-lag(.))) %>%
ungroup()
# STEP 4: Add rolling 7 aggregates and total cases/deaths
countyDailyPerCapita <- countyDailyPerCapita %>%
arrange(state, date) %>%
group_by(state) %>%
helperRollingAgg(origVar="cpm", newName="cpm7", k=7) %>%
helperRollingAgg(origVar="dpm", newName="dpm7", k=7) %>%
ungroup() %>%
mutate(cases=cpm*population/1000000, deaths=dpm*population/1000000)
# STEP 5: Filter the data prior to clustering
# ('state' holds countyFIPS; cast to character so it is never summed)
countyFiltered <- countyDailyPerCapita %>%
filter(population >= minPop, date <= as.Date(maxDate)) %>%
mutate(state=as.character(state))
# STEP 6: Check the implications of the filtering
# Check number of counties that will fail the test for 100 deaths per million or 5000 cases per million
is0 <- function(x) mean(x==0)
isltn <- function(x, n) mean(x<n)
islt100 <- function(x) isltn(x, n=100)
islt5000 <- function(x) isltn(x, n=5000)
countyFiltered %>%
group_by(state) %>%
summarize_at(c("cpm", "dpm"), sum) %>%
ungroup() %>%
summarize_at(vars(all_of(c("cpm", "dpm"))),
.funs=list(mean_is0=is0, mean_lt100=islt100, mean_lt5000=islt5000)
) %>%
print()
# Run county-level clusters and return the output along with the intermediate data file
list(objCluster=clusterStates(countyFiltered, hierarchical=hierarchical, returnList=returnList, ...),
countyFiltered=countyFiltered,
countyDailyPerCapita=countyDailyPerCapita,
countyCumPerCapita=countyCumPerCapita
)
}
The data from USA Facts can now be loaded and analyzed by combining the functions defined above: file-location constants are set, the raw files are read and pivoted, and a main function ties the pieces together and returns a list of results:
# STEP 1a: Define the locations for the population, cases, and deaths file
# (local snapshots of the USA Facts downloads, dated 2020-09-03)
popFile <- "./RInputFiles/Coronavirus/covid_county_population_usafacts.csv"
caseFile <- "./RInputFiles/Coronavirus/covid_confirmed_usafacts_downloaded_20200903.csv"
deathFile <- "./RInputFiles/Coronavirus/covid_deaths_usafacts_downloaded_20200903.csv"
# Last complete date in the snapshot; reused as the unassigned-summary date
maxDate <- "2020-08-31"
# STEP 1b: Read and pivot data from USA Facts; extract population data file as pop_usafacts
rawUSAFacts <- readPivotUSAFacts(popFile=popFile,
caseFile=caseFile,
deathFile=deathFile,
unassignedDate=maxDate
)
## Parsed with column specification:
## cols(
## countyFIPS = col_double(),
## `County Name` = col_character(),
## State = col_character(),
## population = col_double()
## )
## Parsed with column specification:
## cols(
## .default = col_double(),
## `County Name` = col_character(),
## State = col_character()
## )
## See spec(...) for full column specifications.
## Parsed with column specification:
## cols(
## .default = col_double(),
## `County Name` = col_character(),
## State = col_character()
## )
## See spec(...) for full column specifications.
## Joining, by = c("state", "date", "unassigned")
# Keep the population tibble from the list returned by readPivotUSAFacts
pop_usafacts <- rawUSAFacts$pop
# STEP 2: Read case and death data (redundant), combine, and add population totals; no clusters by default
# NOTE(review): with stateClusters=NULL the cluster column is set to NA (see
# the readUSAFacts mutate earlier in this file) -- confirm downstream code
# tolerates an all-NA cluster.
burden_20200903_new <- readUSAFacts(
caseFile=caseFile,
deathFile=deathFile,
stateClusters=NULL
)
## Parsed with column specification:
## cols(
## .default = col_double(),
## `County Name` = col_character(),
## State = col_character()
## )
## See spec(...) for full column specifications.
## Observations: 3,195
## Variables: 228
## $ countyFIPS <dbl> 0, 1001, 1003, 1005, 1007, 1009, 1011, 1013, 1015, 10...
## $ `County Name` <chr> "Statewide Unallocated", "Autauga County", "Baldwin C...
## $ State <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL",...
## $ stateFIPS <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ `1/22/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `1/23/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `1/24/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `1/25/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `1/26/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `1/27/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `1/28/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `1/29/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `1/30/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `1/31/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `2/1/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `2/2/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `2/3/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `2/4/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `2/5/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `2/6/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `2/7/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `2/8/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `2/9/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `2/10/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `2/11/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `2/12/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `2/13/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `2/14/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `2/15/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `2/16/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `2/17/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `2/18/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `2/19/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `2/20/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `2/21/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `2/22/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `2/23/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `2/24/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `2/25/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `2/26/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `2/27/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `2/28/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `2/29/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `3/1/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `3/2/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `3/3/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `3/4/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `3/5/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `3/6/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `3/7/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `3/8/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `3/9/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `3/10/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `3/11/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `3/12/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `3/13/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `3/14/20` <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `3/15/20` <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `3/16/20` <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `3/17/20` <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `3/18/20` <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `3/19/20` <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `3/20/20` <dbl> 0, 0, 2, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `3/21/20` <dbl> 0, 0, 2, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `3/22/20` <dbl> 0, 0, 3, 0, 0, 0, 0, 0, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `3/23/20` <dbl> 0, 0, 3, 0, 0, 0, 0, 0, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `3/24/20` <dbl> 0, 1, 4, 0, 0, 0, 0, 0, 2, 5, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `3/25/20` <dbl> 0, 4, 4, 0, 0, 1, 0, 1, 2, 10, 1, 1, 0, 0, 1, 1, 0, 1...
## $ `3/26/20` <dbl> 0, 6, 5, 0, 0, 2, 2, 1, 2, 13, 1, 4, 1, 0, 1, 1, 0, 1...
## $ `3/27/20` <dbl> 0, 6, 5, 0, 0, 5, 2, 1, 3, 15, 1, 7, 1, 0, 1, 3, 0, 1...
## $ `3/28/20` <dbl> 0, 6, 10, 0, 0, 5, 2, 1, 3, 17, 1, 7, 1, 0, 2, 4, 0, ...
## $ `3/29/20` <dbl> 0, 6, 15, 0, 0, 5, 2, 1, 3, 27, 2, 8, 1, 0, 2, 5, 0, ...
## $ `3/30/20` <dbl> 0, 7, 18, 0, 2, 5, 2, 1, 9, 36, 2, 10, 2, 0, 2, 5, 0,...
## $ `3/31/20` <dbl> 0, 7, 19, 0, 3, 5, 2, 1, 9, 36, 2, 11, 3, 0, 2, 5, 0,...
## $ `4/1/20` <dbl> 0, 10, 23, 0, 3, 5, 2, 1, 11, 45, 2, 13, 4, 2, 3, 6, ...
## $ `4/2/20` <dbl> 0, 10, 25, 0, 4, 6, 2, 1, 12, 67, 4, 14, 4, 2, 7, 6, ...
## $ `4/3/20` <dbl> 0, 12, 28, 1, 4, 9, 2, 1, 20, 81, 5, 15, 4, 3, 8, 7, ...
## $ `4/4/20` <dbl> 0, 12, 29, 2, 4, 10, 2, 1, 21, 87, 6, 15, 4, 7, 9, 7,...
## $ `4/5/20` <dbl> 0, 12, 34, 2, 7, 10, 2, 1, 24, 90, 6, 18, 5, 9, 9, 7,...
## $ `4/6/20` <dbl> 0, 12, 38, 3, 7, 10, 2, 1, 38, 96, 6, 20, 6, 9, 9, 9,...
## $ `4/7/20` <dbl> 0, 12, 42, 3, 8, 10, 2, 2, 48, 102, 6, 20, 6, 10, 9, ...
## $ `4/8/20` <dbl> 0, 12, 49, 3, 9, 10, 3, 3, 52, 140, 7, 22, 6, 10, 11,...
## $ `4/9/20` <dbl> 0, 17, 59, 7, 11, 11, 4, 3, 54, 161, 7, 25, 6, 13, 11...
## $ `4/10/20` <dbl> 0, 17, 59, 9, 11, 12, 4, 3, 54, 171, 7, 27, 7, 13, 11...
## $ `4/11/20` <dbl> 0, 19, 66, 10, 13, 12, 4, 6, 57, 184, 7, 30, 9, 15, 1...
## $ `4/12/20` <dbl> 0, 19, 71, 10, 16, 13, 4, 7, 60, 200, 9, 30, 10, 19, ...
## $ `4/13/20` <dbl> 0, 19, 78, 10, 17, 15, 6, 8, 61, 212, 9, 33, 10, 19, ...
## $ `4/14/20` <dbl> 0, 23, 87, 11, 17, 16, 8, 8, 62, 216, 9, 33, 12, 21, ...
## $ `4/15/20` <dbl> 0, 25, 98, 13, 19, 17, 8, 11, 62, 227, 10, 37, 13, 22...
## $ `4/16/20` <dbl> 0, 25, 102, 14, 23, 18, 8, 11, 63, 234, 11, 37, 13, 2...
## $ `4/17/20` <dbl> 0, 25, 103, 15, 23, 20, 8, 13, 63, 236, 12, 37, 13, 2...
## $ `4/18/20` <dbl> 0, 25, 109, 18, 26, 20, 9, 13, 66, 240, 12, 39, 14, 2...
## $ `4/19/20` <dbl> 0, 27, 114, 20, 28, 21, 9, 14, 72, 246, 12, 42, 14, 2...
## $ `4/20/20` <dbl> 0, 28, 117, 22, 32, 22, 11, 14, 80, 257, 12, 43, 17, ...
## $ `4/21/20` <dbl> 0, 30, 123, 28, 32, 26, 11, 15, 83, 259, 12, 44, 18, ...
## $ `4/22/20` <dbl> 0, 32, 132, 29, 33, 29, 11, 17, 85, 270, 12, 46, 21, ...
## $ `4/23/20` <dbl> 0, 33, 143, 30, 33, 31, 12, 19, 88, 275, 12, 47, 22, ...
## $ `4/24/20` <dbl> 0, 36, 147, 32, 34, 31, 12, 21, 89, 282, 12, 49, 25, ...
## $ `4/25/20` <dbl> 0, 37, 154, 33, 35, 31, 12, 28, 90, 284, 12, 49, 27, ...
## $ `4/26/20` <dbl> 0, 37, 161, 33, 38, 34, 12, 32, 90, 285, 14, 51, 32, ...
## $ `4/27/20` <dbl> 0, 39, 168, 35, 42, 34, 12, 34, 90, 289, 14, 51, 39, ...
## $ `4/28/20` <dbl> 0, 40, 171, 37, 42, 34, 12, 45, 92, 290, 15, 52, 39, ...
## $ `4/29/20` <dbl> 0, 42, 173, 37, 42, 36, 12, 51, 93, 290, 15, 52, 39, ...
## $ `4/30/20` <dbl> 0, 42, 174, 39, 42, 37, 13, 53, 93, 290, 15, 52, 43, ...
## $ `5/1/20` <dbl> 0, 42, 175, 42, 42, 39, 14, 65, 93, 290, 15, 52, 49, ...
## $ `5/2/20` <dbl> 0, 45, 181, 43, 42, 40, 14, 92, 98, 294, 15, 54, 49, ...
## $ `5/3/20` <dbl> 0, 48, 187, 45, 42, 40, 14, 105, 105, 300, 16, 57, 49...
## $ `5/4/20` <dbl> 0, 53, 188, 45, 42, 40, 16, 114, 105, 302, 16, 58, 51...
## $ `5/5/20` <dbl> 0, 53, 189, 47, 43, 40, 18, 120, 114, 304, 17, 60, 54...
## $ `5/6/20` <dbl> 0, 58, 196, 47, 43, 42, 18, 130, 114, 306, 18, 61, 54...
## $ `5/7/20` <dbl> 0, 61, 205, 51, 44, 44, 18, 155, 120, 308, 18, 63, 56...
## $ `5/8/20` <dbl> 0, 67, 208, 53, 44, 44, 21, 162, 123, 311, 21, 63, 59...
## $ `5/9/20` <dbl> 0, 68, 216, 58, 45, 44, 22, 178, 124, 314, 22, 64, 61...
## $ `5/10/20` <dbl> 0, 74, 222, 59, 46, 44, 23, 189, 124, 316, 22, 65, 66...
## $ `5/11/20` <dbl> 0, 84, 224, 61, 46, 45, 26, 196, 125, 319, 24, 67, 67...
## $ `5/12/20` <dbl> 0, 91, 227, 67, 46, 45, 26, 224, 126, 324, 24, 69, 69...
## $ `5/13/20` <dbl> 0, 93, 231, 69, 46, 45, 28, 230, 127, 324, 24, 73, 72...
## $ `5/14/20` <dbl> 0, 103, 243, 74, 46, 45, 28, 249, 128, 326, 25, 74, 7...
## $ `5/15/20` <dbl> 0, 103, 244, 79, 49, 45, 32, 258, 129, 326, 26, 75, 8...
## $ `5/16/20` <dbl> 0, 110, 254, 79, 50, 45, 35, 271, 130, 328, 27, 77, 8...
## $ `5/17/20` <dbl> 0, 110, 254, 81, 50, 46, 35, 272, 130, 328, 27, 77, 8...
## $ `5/18/20` <dbl> 0, 120, 260, 85, 50, 47, 40, 285, 133, 329, 28, 79, 8...
## $ `5/19/20` <dbl> 0, 127, 262, 90, 51, 47, 52, 295, 133, 329, 29, 80, 9...
## $ `5/20/20` <dbl> 0, 136, 270, 96, 52, 47, 64, 312, 136, 330, 30, 83, 1...
## $ `5/21/20` <dbl> 0, 147, 270, 100, 52, 48, 71, 321, 136, 330, 31, 84, ...
## $ `5/22/20` <dbl> 0, 149, 271, 104, 55, 49, 89, 329, 137, 330, 33, 85, ...
## $ `5/23/20` <dbl> 0, 155, 273, 105, 58, 49, 105, 335, 138, 330, 33, 86,...
## $ `5/24/20` <dbl> 0, 159, 274, 110, 59, 49, 111, 344, 141, 336, 33, 87,...
## $ `5/25/20` <dbl> 0, 173, 276, 116, 62, 49, 141, 368, 147, 337, 33, 87,...
## $ `5/26/20` <dbl> 0, 189, 277, 122, 66, 51, 167, 380, 150, 338, 33, 90,...
## $ `5/27/20` <dbl> 0, 192, 281, 130, 71, 53, 176, 391, 152, 340, 33, 93,...
## $ `5/28/20` <dbl> 0, 205, 281, 132, 71, 58, 185, 392, 152, 349, 34, 97,...
## $ `5/29/20` <dbl> 0, 212, 282, 147, 71, 60, 201, 396, 153, 352, 36, 99,...
## $ `5/30/20` <dbl> 0, 216, 283, 150, 72, 61, 203, 402, 154, 353, 37, 100...
## $ `5/31/20` <dbl> 0, 220, 288, 164, 75, 62, 209, 410, 157, 355, 37, 100...
## $ `6/1/20` <dbl> 0, 233, 292, 172, 76, 63, 209, 414, 164, 358, 38, 103...
## $ `6/2/20` <dbl> 0, 238, 292, 175, 76, 63, 212, 416, 165, 358, 38, 104...
## $ `6/3/20` <dbl> 0, 239, 292, 177, 76, 63, 215, 419, 165, 359, 38, 105...
## $ `6/4/20` <dbl> 0, 241, 293, 177, 76, 63, 217, 421, 167, 360, 38, 107...
## $ `6/5/20` <dbl> 0, 248, 296, 183, 76, 64, 219, 431, 169, 363, 38, 108...
## $ `6/6/20` <dbl> 0, 259, 304, 190, 77, 70, 225, 442, 174, 373, 40, 108...
## $ `6/7/20` <dbl> 0, 265, 313, 193, 77, 72, 232, 449, 176, 378, 42, 110...
## $ `6/8/20` <dbl> 0, 272, 320, 197, 79, 73, 238, 455, 178, 383, 42, 111...
## $ `6/9/20` <dbl> 0, 282, 325, 199, 85, 75, 243, 464, 180, 391, 42, 117...
## $ `6/10/20` <dbl> 0, 295, 331, 208, 89, 79, 248, 471, 182, 401, 42, 118...
## $ `6/11/20` <dbl> 0, 312, 343, 214, 93, 87, 253, 484, 184, 417, 42, 121...
## $ `6/12/20` <dbl> 0, 323, 353, 221, 97, 95, 258, 499, 188, 427, 46, 122...
## $ `6/13/20` <dbl> 0, 331, 361, 226, 100, 102, 276, 517, 190, 438, 47, 1...
## $ `6/14/20` <dbl> 0, 357, 364, 234, 104, 110, 302, 536, 195, 453, 51, 1...
## $ `6/15/20` <dbl> 0, 368, 383, 238, 111, 116, 307, 544, 204, 475, 53, 1...
## $ `6/16/20` <dbl> 0, 373, 389, 245, 116, 121, 310, 551, 206, 485, 53, 1...
## $ `6/17/20` <dbl> 0, 375, 392, 251, 118, 123, 313, 554, 208, 486, 53, 1...
## $ `6/18/20` <dbl> 0, 400, 401, 263, 121, 130, 320, 566, 210, 501, 55, 1...
## $ `6/19/20` <dbl> 0, 411, 413, 266, 126, 139, 320, 569, 210, 507, 58, 1...
## $ `6/20/20` <dbl> 0, 431, 420, 272, 126, 143, 327, 572, 211, 516, 58, 1...
## $ `6/21/20` <dbl> 0, 434, 430, 272, 127, 149, 327, 576, 213, 521, 58, 1...
## $ `6/22/20` <dbl> 0, 442, 437, 277, 129, 153, 328, 578, 215, 528, 58, 1...
## $ `6/23/20` <dbl> 0, 453, 450, 280, 135, 159, 329, 581, 216, 534, 58, 1...
## $ `6/24/20` <dbl> 0, 469, 464, 288, 141, 168, 336, 584, 220, 543, 58, 1...
## $ `6/25/20` <dbl> 0, 479, 477, 305, 149, 176, 351, 588, 233, 549, 64, 1...
## $ `6/26/20` <dbl> 0, 488, 515, 312, 153, 184, 351, 594, 236, 559, 68, 1...
## $ `6/27/20` <dbl> 0, 498, 555, 317, 161, 188, 358, 600, 245, 561, 69, 2...
## $ `6/28/20` <dbl> 0, 503, 575, 317, 162, 189, 358, 602, 245, 561, 70, 2...
## $ `6/29/20` <dbl> 0, 527, 643, 322, 165, 199, 365, 605, 269, 585, 73, 2...
## $ `6/30/20` <dbl> 0, 537, 680, 325, 170, 208, 365, 607, 276, 590, 74, 2...
## $ `7/1/20` <dbl> 0, 553, 703, 326, 174, 218, 367, 607, 278, 595, 77, 2...
## $ `7/2/20` <dbl> 0, 561, 751, 335, 179, 222, 369, 610, 288, 611, 82, 2...
## $ `7/3/20` <dbl> 0, 568, 845, 348, 189, 230, 372, 625, 330, 625, 88, 2...
## $ `7/4/20` <dbl> 0, 591, 863, 350, 190, 234, 373, 626, 340, 637, 88, 2...
## $ `7/5/20` <dbl> 0, 615, 881, 352, 193, 239, 373, 634, 362, 642, 100, ...
## $ `7/6/20` <dbl> 0, 618, 911, 356, 197, 247, 373, 634, 384, 655, 105, ...
## $ `7/7/20` <dbl> 0, 644, 997, 360, 199, 255, 373, 634, 395, 656, 106, ...
## $ `7/8/20` <dbl> 0, 651, 1056, 366, 201, 262, 374, 639, 411, 660, 114,...
## $ `7/9/20` <dbl> 0, 661, 1131, 371, 211, 282, 375, 646, 445, 672, 115,...
## $ `7/10/20` <dbl> 0, 670, 1187, 381, 218, 292, 381, 648, 465, 679, 118,...
## $ `7/11/20` <dbl> 0, 684, 1224, 398, 224, 307, 382, 654, 500, 690, 128,...
## $ `7/12/20` <dbl> 0, 706, 1294, 403, 228, 331, 383, 655, 526, 693, 129,...
## $ `7/13/20` <dbl> 0, 728, 1359, 413, 231, 350, 383, 660, 566, 702, 136,...
## $ `7/14/20` <dbl> 0, 746, 1414, 428, 236, 366, 385, 661, 589, 712, 140,...
## $ `7/15/20` <dbl> 0, 756, 1518, 441, 242, 389, 386, 664, 655, 718, 145,...
## $ `7/16/20` <dbl> 0, 780, 1599, 459, 247, 424, 389, 669, 675, 731, 152,...
## $ `7/17/20` <dbl> 0, 789, 1689, 463, 255, 440, 393, 672, 720, 742, 157,...
## $ `7/18/20` <dbl> 0, 827, 1819, 483, 264, 458, 397, 678, 741, 756, 165,...
## $ `7/19/20` <dbl> 0, 842, 1937, 495, 269, 482, 398, 686, 785, 762, 173,...
## $ `7/20/20` <dbl> 0, 857, 2013, 503, 279, 507, 400, 689, 832, 767, 179,...
## $ `7/21/20` <dbl> 0, 865, 2102, 514, 283, 524, 401, 695, 869, 774, 182,...
## $ `7/22/20` <dbl> 0, 886, 2196, 518, 287, 547, 407, 701, 891, 782, 184,...
## $ `7/23/20` <dbl> 0, 905, 2461, 534, 289, 585, 408, 706, 934, 789, 193,...
## $ `7/24/20` <dbl> 0, 921, 2513, 539, 303, 615, 411, 711, 999, 797, 205,...
## $ `7/25/20` <dbl> 0, 932, 2662, 552, 318, 637, 414, 720, 1062, 810, 207...
## $ `7/26/20` <dbl> 0, 942, 2708, 562, 324, 646, 415, 724, 1113, 821, 209...
## $ `7/27/20` <dbl> 0, 965, 2770, 569, 334, 669, 416, 730, 1194, 825, 220...
## $ `7/28/20` <dbl> 0, 974, 2835, 575, 337, 675, 429, 734, 1243, 836, 221...
## $ `7/29/20` <dbl> 0, 974, 2835, 575, 338, 675, 429, 734, 1244, 836, 221...
## $ `7/30/20` <dbl> 0, 1002, 3028, 585, 352, 731, 435, 747, 1336, 848, 23...
## $ `7/31/20` <dbl> 0, 1015, 3101, 598, 363, 767, 437, 753, 1450, 859, 23...
## $ `8/1/20` <dbl> 0, 1030, 3142, 602, 368, 792, 443, 757, 1480, 861, 24...
## $ `8/2/20` <dbl> 0, 1052, 3223, 610, 372, 813, 445, 765, 1580, 868, 25...
## $ `8/3/20` <dbl> 0, 1066, 3265, 612, 382, 830, 446, 766, 1612, 875, 26...
## $ `8/4/20` <dbl> 0, 1073, 3320, 614, 389, 836, 449, 766, 1646, 882, 26...
## $ `8/5/20` <dbl> 0, 1073, 3380, 615, 392, 839, 452, 769, 1683, 886, 27...
## $ `8/6/20` <dbl> 0, 1096, 3438, 619, 421, 874, 458, 771, 1741, 893, 28...
## $ `8/7/20` <dbl> 0, 1113, 3504, 624, 424, 909, 462, 774, 1777, 899, 29...
## $ `8/8/20` <dbl> 0, 1134, 3564, 628, 434, 923, 471, 773, 1836, 904, 29...
## $ `8/9/20` <dbl> 0, 1215, 3606, 630, 446, 934, 472, 779, 1860, 906, 30...
## $ `8/10/20` <dbl> 0, 1215, 3714, 631, 450, 947, 474, 782, 1883, 909, 30...
## $ `8/11/20` <dbl> 0, 1215, 3736, 643, 455, 958, 489, 785, 1914, 916, 30...
## $ `8/12/20` <dbl> 0, 1241, 3776, 646, 464, 967, 500, 788, 1935, 918, 31...
## $ `8/13/20` <dbl> 0, 1250, 3813, 651, 469, 977, 501, 790, 1959, 919, 32...
## $ `8/14/20` <dbl> 0, 1252, 3860, 656, 477, 989, 502, 796, 1975, 922, 32...
## $ `8/15/20` <dbl> 0, 1262, 3909, 663, 483, 996, 503, 807, 2019, 925, 33...
## $ `8/16/20` <dbl> 0, 1273, 3948, 671, 483, 1005, 504, 811, 2037, 927, 3...
## $ `8/17/20` <dbl> 0, 1274, 3960, 672, 488, 1008, 504, 814, 2055, 928, 3...
## $ `8/18/20` <dbl> 0, 1291, 3977, 674, 490, 1034, 512, 814, 2107, 937, 3...
## $ `8/19/20` <dbl> 0, 1293, 4002, 683, 503, 1049, 530, 814, 2159, 941, 3...
## $ `8/20/20` <dbl> 0, 1293, 4035, 690, 507, 1077, 534, 814, 2214, 949, 3...
## $ `8/21/20` <dbl> 0, 1293, 4054, 690, 509, 1083, 534, 814, 2228, 952, 3...
## $ `8/22/20` <dbl> 0, 1322, 4115, 699, 516, 1096, 536, 822, 2276, 957, 3...
## $ `8/23/20` <dbl> 0, 1324, 4147, 702, 523, 1099, 536, 824, 2286, 958, 3...
## $ `8/24/20` <dbl> 0, 1351, 4167, 720, 526, 1135, 536, 825, 2327, 971, 3...
## $ `8/25/20` <dbl> 0, 1355, 4190, 724, 527, 1160, 536, 826, 2345, 973, 3...
## $ `8/26/20` <dbl> 0, 1366, 4265, 732, 530, 1195, 537, 833, 2400, 983, 3...
## $ `8/27/20` <dbl> 0, 1377, 4311, 739, 533, 1213, 538, 839, 2413, 1011, ...
## $ `8/28/20` <dbl> 0, 1389, 4347, 745, 535, 1219, 541, 840, 2443, 1017, ...
## $ `8/29/20` <dbl> 0, 1400, 4424, 753, 540, 1248, 546, 855, 2499, 1024, ...
## $ `8/30/20` <dbl> 0, 1438, 4525, 757, 550, 1277, 550, 864, 2533, 1027, ...
## $ `8/31/20` <dbl> 0, 1442, 4545, 757, 554, 1287, 551, 866, 2567, 1033, ...
## $ `9/1/20` <dbl> 0, 1453, 4568, 764, 558, 1303, 559, 871, 2619, 1041, ...
## Observations: 715,680
## Variables: 6
## $ countyFIPS <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ countyName <chr> "Statewide Unallocated", "Statewide Unallocated", "State...
## $ state <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "A...
## $ stateFIPS <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ date <date> 2020-01-22, 2020-01-23, 2020-01-24, 2020-01-25, 2020-01...
## $ cumCases <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## Parsed with column specification:
## cols(
## .default = col_double(),
## `County Name` = col_character(),
## State = col_character()
## )
## See spec(...) for full column specifications.
## Observations: 3,195
## Variables: 228
## $ countyFIPS <dbl> 0, 1001, 1003, 1005, 1007, 1009, 1011, 1013, 1015, 10...
## $ `County Name` <chr> "Statewide Unallocated", "Autauga County", "Baldwin C...
## $ State <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL",...
## $ stateFIPS <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ `1/22/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `1/23/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `1/24/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `1/25/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `1/26/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `1/27/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `1/28/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `1/29/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `1/30/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `1/31/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `2/1/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `2/2/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `2/3/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `2/4/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `2/5/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `2/6/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `2/7/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `2/8/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `2/9/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `2/10/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `2/11/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `2/12/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `2/13/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `2/14/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `2/15/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `2/16/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `2/17/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `2/18/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `2/19/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `2/20/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `2/21/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `2/22/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `2/23/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `2/24/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `2/25/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `2/26/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `2/27/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `2/28/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `2/29/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `3/1/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `3/2/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `3/3/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `3/4/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `3/5/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `3/6/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `3/7/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `3/8/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `3/9/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `3/10/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `3/11/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `3/12/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `3/13/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `3/14/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `3/15/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `3/16/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `3/17/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `3/18/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `3/19/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `3/20/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `3/21/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `3/22/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `3/23/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `3/24/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `3/25/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `3/26/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `3/27/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `3/28/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `3/29/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `3/30/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `3/31/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `4/1/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `4/2/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `4/3/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `4/4/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `4/5/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `4/6/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `4/7/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `4/8/20` <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `4/9/20` <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `4/10/20` <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 1,...
## $ `4/11/20` <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 1,...
## $ `4/12/20` <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 1,...
## $ `4/13/20` <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 1,...
## $ `4/14/20` <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 1,...
## $ `4/15/20` <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 1,...
## $ `4/16/20` <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 1,...
## $ `4/17/20` <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 1, 11, 0, 0, 0, 0, 0, 0, 0, 1...
## $ `4/18/20` <dbl> 0, 1, 1, 0, 0, 0, 0, 0, 2, 11, 0, 0, 0, 0, 0, 0, 0, 1...
## $ `4/19/20` <dbl> 0, 1, 1, 0, 0, 0, 0, 0, 2, 11, 0, 0, 0, 0, 0, 0, 0, 1...
## $ `4/20/20` <dbl> 0, 1, 1, 0, 0, 0, 0, 0, 2, 11, 0, 0, 0, 0, 0, 0, 0, 1...
## $ `4/21/20` <dbl> 0, 1, 2, 0, 0, 0, 0, 0, 3, 13, 0, 0, 0, 1, 0, 0, 0, 1...
## $ `4/22/20` <dbl> 0, 1, 2, 0, 0, 0, 0, 0, 3, 16, 0, 0, 0, 1, 0, 1, 0, 1...
## $ `4/23/20` <dbl> 0, 2, 2, 0, 0, 0, 0, 0, 3, 16, 0, 1, 0, 1, 1, 1, 0, 1...
## $ `4/24/20` <dbl> 0, 2, 2, 0, 0, 0, 0, 0, 3, 17, 0, 1, 0, 1, 1, 1, 0, 2...
## $ `4/25/20` <dbl> 0, 2, 2, 0, 0, 0, 0, 0, 3, 18, 0, 1, 0, 1, 1, 1, 0, 2...
## $ `4/26/20` <dbl> 0, 2, 2, 0, 0, 0, 0, 1, 3, 18, 0, 1, 0, 1, 1, 1, 0, 2...
## $ `4/27/20` <dbl> 0, 3, 2, 0, 0, 0, 0, 1, 3, 18, 0, 1, 0, 1, 1, 1, 0, 2...
## $ `4/28/20` <dbl> 0, 3, 2, 0, 0, 0, 0, 1, 3, 19, 0, 1, 0, 1, 1, 1, 0, 2...
## $ `4/29/20` <dbl> 0, 3, 2, 1, 0, 0, 0, 1, 3, 21, 0, 1, 0, 1, 1, 1, 0, 2...
## $ `4/30/20` <dbl> 0, 3, 3, 1, 0, 0, 0, 1, 3, 21, 0, 1, 0, 1, 1, 1, 0, 2...
## $ `5/1/20` <dbl> 0, 3, 4, 1, 0, 0, 0, 1, 3, 21, 0, 1, 0, 1, 1, 1, 0, 2...
## $ `5/2/20` <dbl> 0, 3, 4, 1, 0, 0, 0, 1, 3, 21, 0, 1, 0, 1, 1, 1, 0, 2...
## $ `5/3/20` <dbl> 0, 3, 4, 1, 0, 0, 0, 1, 3, 21, 0, 1, 0, 1, 1, 1, 0, 2...
## $ `5/4/20` <dbl> 0, 3, 4, 1, 0, 0, 0, 1, 3, 21, 0, 1, 0, 1, 1, 1, 0, 2...
## $ `5/5/20` <dbl> 0, 3, 5, 1, 0, 0, 0, 2, 3, 21, 0, 1, 0, 1, 1, 1, 0, 2...
## $ `5/6/20` <dbl> 0, 3, 5, 1, 0, 0, 1, 2, 3, 21, 0, 1, 0, 1, 1, 1, 0, 2...
## $ `5/7/20` <dbl> 0, 3, 5, 1, 0, 0, 1, 2, 3, 21, 0, 1, 0, 1, 1, 1, 0, 2...
## $ `5/8/20` <dbl> 0, 3, 5, 1, 1, 0, 1, 3, 3, 21, 0, 1, 0, 1, 1, 1, 0, 2...
## $ `5/9/20` <dbl> 0, 3, 5, 1, 1, 0, 1, 6, 3, 21, 0, 1, 0, 1, 1, 1, 0, 2...
## $ `5/10/20` <dbl> 0, 3, 5, 1, 1, 0, 1, 6, 3, 21, 0, 1, 0, 1, 1, 1, 0, 2...
## $ `5/11/20` <dbl> 0, 3, 6, 1, 1, 0, 1, 6, 3, 21, 0, 1, 0, 1, 1, 1, 0, 2...
## $ `5/12/20` <dbl> 0, 3, 7, 1, 1, 0, 1, 6, 3, 21, 0, 1, 1, 1, 1, 1, 1, 2...
## $ `5/13/20` <dbl> 0, 3, 7, 1, 1, 0, 1, 6, 3, 22, 0, 1, 2, 1, 1, 1, 1, 2...
## $ `5/14/20` <dbl> 0, 3, 8, 1, 1, 0, 1, 8, 3, 22, 0, 1, 3, 2, 2, 1, 1, 2...
## $ `5/15/20` <dbl> 0, 3, 8, 1, 1, 0, 1, 9, 3, 22, 0, 1, 3, 2, 2, 1, 1, 2...
## $ `5/16/20` <dbl> 0, 3, 8, 1, 1, 0, 1, 9, 3, 22, 0, 1, 3, 2, 2, 1, 1, 2...
## $ `5/17/20` <dbl> 0, 3, 8, 1, 1, 1, 1, 9, 3, 22, 0, 1, 3, 2, 2, 1, 1, 2...
## $ `5/18/20` <dbl> 0, 3, 8, 1, 1, 1, 1, 10, 3, 22, 0, 1, 3, 2, 2, 1, 1, ...
## $ `5/19/20` <dbl> 0, 3, 8, 1, 1, 1, 1, 10, 3, 22, 0, 1, 3, 2, 2, 1, 1, ...
## $ `5/20/20` <dbl> 0, 3, 8, 1, 1, 1, 1, 11, 3, 23, 0, 1, 3, 2, 2, 1, 1, ...
## $ `5/21/20` <dbl> 0, 3, 8, 1, 1, 1, 1, 11, 3, 23, 0, 1, 3, 2, 2, 1, 1, ...
## $ `5/22/20` <dbl> 0, 3, 9, 1, 1, 1, 1, 11, 3, 23, 2, 1, 4, 2, 2, 1, 1, ...
## $ `5/23/20` <dbl> 0, 3, 9, 1, 1, 1, 1, 11, 3, 23, 2, 1, 4, 2, 2, 1, 1, ...
## $ `5/24/20` <dbl> 0, 3, 9, 1, 1, 1, 1, 12, 3, 23, 2, 1, 4, 2, 2, 1, 1, ...
## $ `5/25/20` <dbl> 0, 3, 9, 1, 1, 1, 3, 12, 3, 24, 2, 1, 4, 2, 2, 1, 1, ...
## $ `5/26/20` <dbl> 0, 3, 9, 1, 1, 1, 3, 13, 3, 24, 2, 1, 7, 2, 2, 1, 1, ...
## $ `5/27/20` <dbl> 0, 3, 9, 1, 1, 1, 3, 13, 3, 24, 2, 1, 7, 2, 2, 1, 1, ...
## $ `5/28/20` <dbl> 0, 3, 9, 1, 1, 1, 4, 15, 3, 24, 2, 1, 8, 2, 2, 1, 1, ...
## $ `5/29/20` <dbl> 0, 3, 9, 1, 1, 1, 4, 16, 3, 24, 3, 1, 8, 2, 2, 1, 1, ...
## $ `5/30/20` <dbl> 0, 4, 9, 1, 1, 1, 4, 17, 3, 25, 3, 1, 8, 2, 2, 1, 1, ...
## $ `5/31/20` <dbl> 0, 4, 9, 1, 1, 1, 5, 18, 3, 25, 3, 1, 8, 2, 2, 1, 1, ...
## $ `6/1/20` <dbl> 0, 5, 9, 1, 1, 1, 6, 18, 3, 25, 3, 1, 10, 2, 2, 1, 1,...
## $ `6/2/20` <dbl> 0, 5, 9, 1, 1, 1, 6, 18, 3, 26, 3, 1, 10, 2, 2, 1, 1,...
## $ `6/3/20` <dbl> 0, 5, 9, 1, 1, 1, 6, 18, 3, 26, 3, 1, 10, 2, 2, 1, 1,...
## $ `6/4/20` <dbl> 0, 5, 9, 1, 1, 1, 6, 18, 3, 26, 3, 1, 10, 2, 2, 1, 1,...
## $ `6/5/20` <dbl> 0, 5, 9, 1, 1, 1, 7, 21, 3, 26, 3, 1, 10, 2, 2, 1, 1,...
## $ `6/6/20` <dbl> 0, 5, 9, 1, 1, 1, 7, 22, 3, 26, 4, 2, 10, 2, 2, 1, 1,...
## $ `6/7/20` <dbl> 0, 5, 9, 1, 1, 1, 7, 22, 3, 26, 4, 2, 10, 2, 2, 1, 1,...
## $ `6/8/20` <dbl> 0, 5, 9, 1, 1, 1, 8, 24, 3, 26, 4, 2, 10, 3, 2, 1, 1,...
## $ `6/9/20` <dbl> 0, 5, 9, 1, 1, 1, 8, 24, 3, 26, 4, 2, 10, 3, 2, 1, 1,...
## $ `6/10/20` <dbl> 0, 6, 9, 1, 1, 1, 8, 24, 3, 26, 4, 2, 11, 3, 2, 1, 1,...
## $ `6/11/20` <dbl> 0, 6, 9, 1, 1, 1, 8, 25, 3, 26, 4, 2, 11, 3, 2, 1, 1,...
## $ `6/12/20` <dbl> 0, 6, 9, 1, 1, 1, 8, 25, 3, 26, 5, 2, 11, 3, 2, 1, 1,...
## $ `6/13/20` <dbl> 0, 6, 9, 1, 1, 1, 8, 25, 3, 26, 5, 2, 11, 3, 2, 1, 1,...
## $ `6/14/20` <dbl> 0, 6, 9, 1, 1, 1, 8, 25, 3, 26, 5, 2, 11, 3, 2, 1, 1,...
## $ `6/15/20` <dbl> 0, 6, 9, 1, 1, 1, 9, 25, 3, 26, 5, 2, 11, 3, 2, 1, 1,...
## $ `6/16/20` <dbl> 0, 7, 9, 1, 1, 1, 9, 25, 4, 26, 5, 2, 11, 3, 2, 1, 1,...
## $ `6/17/20` <dbl> 0, 7, 9, 1, 1, 1, 9, 25, 4, 26, 5, 2, 11, 3, 2, 1, 1,...
## $ `6/18/20` <dbl> 0, 8, 9, 1, 1, 1, 9, 25, 4, 26, 5, 3, 11, 4, 2, 1, 1,...
## $ `6/19/20` <dbl> 0, 8, 9, 1, 1, 1, 10, 26, 4, 27, 6, 3, 11, 4, 2, 1, 1...
## $ `6/20/20` <dbl> 0, 9, 9, 1, 1, 1, 10, 26, 4, 27, 6, 3, 12, 4, 2, 1, 1...
## $ `6/21/20` <dbl> 0, 9, 9, 1, 1, 1, 10, 26, 4, 27, 6, 3, 12, 4, 2, 1, 1...
## $ `6/22/20` <dbl> 0, 9, 9, 1, 1, 1, 10, 26, 4, 27, 6, 3, 12, 4, 2, 1, 1...
## $ `6/23/20` <dbl> 0, 9, 9, 1, 1, 1, 10, 27, 5, 27, 7, 3, 12, 4, 2, 1, 1...
## $ `6/24/20` <dbl> 0, 11, 9, 1, 1, 1, 10, 27, 5, 27, 7, 3, 12, 5, 2, 1, ...
## $ `6/25/20` <dbl> 0, 11, 9, 1, 1, 1, 10, 27, 5, 27, 7, 3, 12, 5, 2, 1, ...
## $ `6/26/20` <dbl> 0, 11, 9, 1, 1, 1, 10, 27, 5, 27, 7, 3, 12, 5, 2, 1, ...
## $ `6/27/20` <dbl> 0, 12, 10, 1, 1, 1, 10, 27, 5, 27, 7, 3, 12, 5, 2, 1,...
## $ `6/28/20` <dbl> 0, 12, 10, 1, 1, 1, 10, 27, 5, 27, 7, 3, 12, 5, 2, 1,...
## $ `6/29/20` <dbl> 0, 12, 10, 1, 1, 1, 10, 27, 5, 27, 7, 3, 12, 5, 2, 1,...
## $ `6/30/20` <dbl> 0, 12, 10, 1, 1, 1, 10, 27, 5, 27, 7, 3, 12, 5, 2, 1,...
## $ `7/1/20` <dbl> 0, 12, 10, 1, 1, 1, 10, 27, 5, 27, 7, 3, 12, 5, 2, 1,...
## $ `7/2/20` <dbl> 0, 13, 10, 1, 1, 1, 10, 27, 5, 27, 7, 3, 12, 6, 2, 1,...
## $ `7/3/20` <dbl> 0, 13, 10, 2, 1, 1, 10, 28, 5, 27, 7, 3, 12, 6, 2, 1,...
## $ `7/4/20` <dbl> 0, 13, 10, 2, 1, 1, 11, 28, 5, 27, 7, 3, 12, 6, 2, 1,...
## $ `7/5/20` <dbl> 0, 13, 10, 2, 1, 1, 11, 28, 5, 27, 7, 3, 12, 6, 2, 1,...
## $ `7/6/20` <dbl> 0, 13, 10, 2, 1, 1, 11, 28, 5, 27, 7, 3, 12, 6, 2, 1,...
## $ `7/7/20` <dbl> 0, 13, 10, 2, 1, 1, 11, 28, 5, 27, 7, 3, 12, 6, 2, 1,...
## $ `7/8/20` <dbl> 0, 13, 10, 2, 1, 1, 11, 28, 5, 27, 7, 3, 12, 6, 2, 1,...
## $ `7/9/20` <dbl> 0, 14, 11, 2, 1, 1, 11, 28, 5, 27, 7, 3, 12, 6, 2, 1,...
## $ `7/10/20` <dbl> 0, 15, 12, 2, 1, 1, 11, 29, 5, 27, 7, 3, 12, 6, 2, 1,...
## $ `7/11/20` <dbl> 0, 15, 12, 2, 1, 1, 11, 29, 5, 27, 7, 3, 12, 6, 2, 1,...
## $ `7/12/20` <dbl> 0, 16, 12, 2, 1, 1, 11, 29, 5, 30, 7, 3, 12, 6, 2, 1,...
## $ `7/13/20` <dbl> 0, 16, 12, 2, 1, 1, 11, 29, 5, 30, 7, 3, 12, 6, 2, 1,...
## $ `7/14/20` <dbl> 0, 18, 12, 3, 2, 1, 11, 31, 6, 32, 7, 4, 12, 6, 2, 1,...
## $ `7/15/20` <dbl> 0, 19, 13, 3, 2, 1, 11, 31, 6, 32, 7, 4, 12, 6, 2, 1,...
## $ `7/16/20` <dbl> 0, 20, 14, 3, 2, 1, 11, 32, 6, 32, 7, 4, 12, 6, 2, 1,...
## $ `7/17/20` <dbl> 0, 21, 14, 3, 2, 1, 11, 33, 6, 32, 7, 4, 12, 6, 2, 1,...
## $ `7/18/20` <dbl> 0, 21, 15, 3, 2, 1, 11, 33, 6, 33, 7, 4, 12, 7, 2, 1,...
## $ `7/19/20` <dbl> 0, 21, 15, 3, 2, 1, 11, 33, 6, 33, 7, 4, 12, 7, 2, 1,...
## $ `7/20/20` <dbl> 0, 21, 15, 4, 2, 1, 11, 33, 6, 33, 7, 4, 12, 7, 2, 1,...
## $ `7/21/20` <dbl> 0, 21, 16, 4, 2, 1, 11, 34, 6, 33, 7, 4, 12, 7, 2, 1,...
## $ `7/22/20` <dbl> 0, 21, 16, 4, 2, 1, 12, 34, 6, 34, 7, 5, 12, 8, 2, 1,...
## $ `7/23/20` <dbl> 0, 21, 17, 4, 2, 1, 12, 35, 6, 34, 7, 5, 12, 9, 2, 1,...
## $ `7/24/20` <dbl> 0, 21, 18, 4, 2, 1, 12, 35, 6, 37, 8, 5, 12, 9, 3, 1,...
## $ `7/25/20` <dbl> 0, 21, 18, 4, 2, 1, 12, 35, 6, 37, 8, 5, 12, 9, 3, 1,...
## $ `7/26/20` <dbl> 0, 21, 18, 4, 2, 1, 12, 35, 6, 37, 8, 5, 12, 9, 3, 1,...
## $ `7/27/20` <dbl> 0, 21, 18, 4, 2, 1, 12, 36, 6, 38, 8, 6, 12, 9, 4, 1,...
## $ `7/28/20` <dbl> 0, 21, 18, 4, 2, 1, 12, 36, 6, 38, 8, 6, 12, 9, 4, 1,...
## $ `7/29/20` <dbl> 0, 21, 21, 4, 2, 3, 12, 36, 6, 38, 8, 6, 12, 9, 4, 1,...
## $ `7/30/20` <dbl> 0, 21, 21, 5, 2, 3, 12, 36, 8, 38, 8, 6, 12, 9, 5, 1,...
## $ `7/31/20` <dbl> 0, 21, 22, 5, 2, 3, 12, 36, 9, 38, 8, 6, 12, 9, 5, 1,...
## $ `8/1/20` <dbl> 0, 21, 22, 5, 2, 3, 12, 36, 9, 38, 8, 6, 12, 9, 5, 1,...
## $ `8/2/20` <dbl> 0, 21, 23, 5, 3, 3, 12, 36, 9, 38, 8, 7, 12, 9, 5, 1,...
## $ `8/3/20` <dbl> 0, 21, 24, 5, 3, 3, 12, 36, 9, 38, 8, 7, 12, 9, 5, 1,...
## $ `8/4/20` <dbl> 0, 21, 24, 5, 3, 3, 12, 36, 12, 38, 8, 7, 12, 9, 5, 1...
## $ `8/5/20` <dbl> 0, 22, 24, 5, 4, 3, 12, 36, 13, 38, 8, 7, 12, 9, 5, 1...
## $ `8/6/20` <dbl> 0, 22, 25, 5, 4, 3, 12, 36, 13, 38, 8, 7, 12, 9, 5, 1...
## $ `8/7/20` <dbl> 0, 22, 25, 5, 4, 3, 12, 36, 13, 38, 8, 8, 12, 9, 5, 1...
## $ `8/8/20` <dbl> 0, 22, 26, 5, 5, 4, 12, 37, 13, 38, 8, 8, 12, 9, 5, 1...
## $ `8/9/20` <dbl> 0, 22, 27, 5, 5, 4, 12, 37, 14, 38, 8, 8, 12, 9, 5, 1...
## $ `8/10/20` <dbl> 0, 22, 28, 5, 5, 4, 12, 37, 17, 38, 9, 9, 12, 10, 5, ...
## $ `8/11/20` <dbl> 0, 23, 32, 6, 6, 5, 12, 37, 19, 38, 9, 12, 12, 10, 5,...
## $ `8/12/20` <dbl> 0, 23, 32, 6, 6, 5, 12, 37, 20, 38, 9, 12, 12, 10, 5,...
## $ `8/13/20` <dbl> 0, 23, 32, 6, 6, 5, 12, 37, 20, 38, 9, 12, 12, 10, 5,...
## $ `8/14/20` <dbl> 0, 23, 32, 6, 6, 5, 12, 37, 20, 38, 9, 12, 12, 10, 5,...
## $ `8/15/20` <dbl> 0, 23, 32, 6, 6, 5, 12, 37, 20, 38, 9, 12, 12, 10, 5,...
## $ `8/16/20` <dbl> 0, 23, 32, 6, 6, 5, 12, 37, 20, 38, 9, 12, 12, 10, 5,...
## $ `8/17/20` <dbl> 0, 23, 32, 6, 6, 5, 14, 37, 23, 38, 9, 12, 12, 10, 5,...
## $ `8/18/20` <dbl> 0, 23, 33, 6, 6, 5, 14, 37, 25, 38, 9, 12, 12, 10, 5,...
## $ `8/19/20` <dbl> 0, 23, 33, 7, 6, 5, 14, 37, 25, 38, 9, 12, 12, 10, 5,...
## $ `8/20/20` <dbl> 0, 23, 34, 7, 6, 5, 14, 37, 25, 38, 10, 12, 12, 10, 5...
## $ `8/21/20` <dbl> 0, 23, 35, 7, 6, 6, 14, 37, 25, 38, 10, 13, 12, 10, 5...
## $ `8/22/20` <dbl> 0, 23, 35, 7, 6, 6, 14, 37, 25, 38, 10, 13, 12, 10, 6...
## $ `8/23/20` <dbl> 0, 23, 35, 7, 6, 6, 14, 37, 25, 38, 10, 13, 12, 10, 6...
## $ `8/24/20` <dbl> 0, 23, 35, 7, 6, 6, 14, 37, 27, 38, 10, 13, 12, 11, 6...
## $ `8/25/20` <dbl> 0, 23, 35, 7, 6, 6, 14, 37, 28, 39, 10, 13, 12, 11, 6...
## $ `8/26/20` <dbl> 0, 23, 36, 7, 6, 7, 14, 37, 28, 39, 10, 13, 12, 12, 6...
## $ `8/27/20` <dbl> 0, 23, 37, 7, 6, 7, 14, 37, 30, 39, 12, 13, 12, 13, 6...
## $ `8/28/20` <dbl> 0, 23, 39, 7, 6, 9, 14, 37, 32, 40, 12, 13, 12, 13, 6...
## $ `8/29/20` <dbl> 0, 23, 40, 7, 7, 9, 14, 37, 35, 40, 12, 13, 12, 14, 6...
## $ `8/30/20` <dbl> 0, 23, 40, 7, 7, 10, 14, 37, 35, 40, 12, 13, 12, 14, ...
## $ `8/31/20` <dbl> 0, 23, 42, 7, 8, 11, 14, 37, 36, 40, 12, 13, 12, 14, ...
## $ `9/1/20` <dbl> 0, 24, 42, 7, 8, 11, 14, 37, 38, 40, 12, 13, 12, 14, ...
## Observations: 715,680
## Variables: 6
## $ countyFIPS <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ countyName <chr> "Statewide Unallocated", "Statewide Unallocated", "State...
## $ state <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "A...
## $ stateFIPS <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ date <date> 2020-01-22, 2020-01-23, 2020-01-24, 2020-01-25, 2020-01...
## $ cumDeaths <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
# STEP 3: Explore the cases and deaths by county (can be repeated for other
# sets of states via inclStates)
# casesDeathsByCounty() is a project helper defined earlier in the full file;
# presumably it summarizes/plots county-level burden as of useDate -- confirm
# against its definition.
casesDeathsByCounty(useDate=maxDate,
inclStates=c("FL", "GA", "SC", "AL", "MS"),
caseData=rawUSAFacts$cases,
deathData=rawUSAFacts$deaths,
highCaseAmount=80000,
highDeathAmount=2000
)
# STEP 4: Create county-level clusters (k-means, 5 clusters, minimum county population 25k)
# prepClusterCounties() is a project helper; the ratio/min thresholds below are
# passed straight through, so their exact semantics should be confirmed against
# its definition. hierarchical=FALSE selects k-means; nCenters is the final k,
# while testCenters is presumably the range of k evaluated for diagnostics.
# iter.max/nstart look like pass-throughs to stats::kmeans -- confirm.
clust_20200903_new <- prepClusterCounties(burdenFile=burden_20200903_new,
maxDate=maxDate,
minPop=25000,
hierarchical=FALSE,
minShape=3,
ratioDeathvsCase = 5,
ratioTotalvsShape = 0.5,
minDeath=100,
minCase=5000,
nCenters=5,
testCenters=1:25,
iter.max=20,
nstart=10,
seed=2009081450
)
## # A tibble: 1 x 6
## cpm_mean_is0 dpm_mean_is0 cpm_mean_lt100 dpm_mean_lt100 cpm_mean_lt5000
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0 0.0352 0 0.266 0.140
## # ... with 1 more variable: dpm_mean_lt5000 <dbl>
##
## Cluster means and counts
## 1 2 3 4 5
## . 501.00 473.00 53.00 199.00 365.00
## totalCases 0.69 0.32 1.23 1.14 0.52
## totalDeaths 1.36 0.36 9.67 4.76 1.86
## cases_3 0.01 0.02 0.08 0.02 0.02
## deaths_3 0.05 0.08 0.12 0.05 0.09
## cases_4 0.04 0.06 0.39 0.14 0.17
## deaths_4 0.35 0.54 2.07 0.85 1.45
## cases_5 0.05 0.08 0.17 0.14 0.17
## deaths_5 0.29 0.34 1.29 0.99 1.72
## cases_6 0.13 0.11 0.10 0.15 0.12
## deaths_6 0.36 0.52 0.62 0.70 0.73
## cases_7 0.39 0.27 0.14 0.32 0.24
## deaths_7 1.25 0.69 0.46 1.09 0.45
## cases_8 0.38 0.32 0.12 0.24 0.25
## deaths_8 2.61 0.58 0.43 1.32 0.51
# STEP 5: Extract the clusters from the clustering object
# NOTE(review): the repeated $objCluster$objCluster looks odd but presumably
# matches the nesting returned by prepClusterCounties() (a list whose
# objCluster element itself wraps the kmeans fit) -- confirm against the
# function definition.
clustVec_county_20200903_new <- clust_20200903_new$objCluster$objCluster$cluster
# STEP 6: Assess the quality of the new clusters
# NOTE(review): dfPop and dfBurden are both the same filtered county file;
# presumably the helper extracts different columns from each -- confirm.
helperACC_county_20200903_new <- helperAssessCountyClusters(clustVec_county_20200903_new,
dfPop=clust_20200903_new$countyFiltered,
dfBurden=clust_20200903_new$countyFiltered,
thruLabel="Sep 3, 2020",
plotsTogether=TRUE,
orderCluster=TRUE
)
##
## Recency is defined as 2020-08-02 through current
##
## Recency is defined as 2020-08-02 through current
# STEP 7: Create a plot of cumulative burden by cluster
helperACC_county_20200903_new %>%
select(cluster, date, pop, cases, deaths) %>%
# Sum pop/cases/deaths across counties for each cluster-date cell;
# summarize() then drops the innermost group, leaving grouping by cluster
group_by(cluster, date) %>%
summarize_if(is.numeric, sum, na.rm=TRUE) %>%
# Order matters: rows must be in date order before the grouped cumsum below
arrange(date) %>%
mutate(cpmcum=cumsum(cases)*1000000/pop, dpmcum=cumsum(deaths)*1000000/pop) %>%
ungroup() %>%
select(cluster, date, cases=cpmcum, deaths=dpmcum) %>%
pivot_longer(-c(cluster, date)) %>%
ggplot(aes(x=date, y=value, color=cluster)) +
geom_line(size=1) +
# Label only the final date of each series, nudged 2 days right of the line end
geom_text(data=~filter(., date==max(date)),
aes(x=date+lubridate::days(2), label=round(value)),
size=3,
hjust=0
) +
labs(x="", title="Cumulative burden per million people by segment", y="") +
# Named-vector lookup turns the pivoted column name into a facet label
facet_wrap(~c("cases"="Cases per million", "deaths"="Deaths per million")[name], scales="free_y") +
# expand_scale() (correct for ggplot2 < 3.3 used here) leaves 10% headroom on
# the right so the geom_text labels are not clipped
scale_x_date(date_breaks="1 months", date_labels="%b", expand=expand_scale(c(0, 0.1)))
# STEP 8: Add back clusters not used for analysis (code 999) and associated disease data
# helperMakeClusterStateData() joins on fipsCounty (per the "Joining, by"
# messages in the rendered output); orderCluster=TRUE presumably relabels
# clusters in a canonical order -- confirm against the helper's definition.
clusterStateData_20200903_new <- helperMakeClusterStateData(helperACC_county_20200903_new,
dfBurden=clust_20200903_new$countyDailyPerCapita,
orderCluster=TRUE
)
## Joining, by = "fipsCounty"
## Joining, by = "fipsCounty"
# STEP 9: Run an example state-level summary (can expand to other states)
# Upper-midwest states used as the example; changeOrderLabel() presumably
# reorders/relabels cluster factor levels at the county (fipsCounty) grain
# before the summary -- confirm.
stateCountySummary(states=c("MN", "ND", "SD", "WI"),
df=changeOrderLabel(clusterStateData_20200903_new, grpVars="fipsCounty"),
keyDate=maxDate,
showQuadrants=TRUE,
showCumulative=TRUE,
facetCumulativeByState = TRUE,
showAllFactorLevels = TRUE
)
Next steps are to continue fleshing out the analysis component, including enabling new data loads that reuse the existing segments and comparing the lags between cases and deaths:
# Recover the state-to-cluster mapping used in the earlier burdenUS analysis:
# one row per (state, cluster) pair, with the helper count column discarded
oldClusters <- burdenUS %>%
  count(state, cluster) %>%
  select(-n)
# Named vector of cluster codes, one element per state (names are state codes),
# in the shape readUSAFacts() expects for its stateClusters argument
oldcvec <- setNames(pull(oldClusters, cluster), oldClusters$state)
# File read in from previous data (downloaded 2020-09-03); readUSAFacts() is a
# project helper that reads the case and death CSVs and presumably attaches the
# prior state-level cluster assignments via stateClusters -- confirm.
burden_20200903 <- readUSAFacts(
caseFile="./RInputFiles/Coronavirus/covid_confirmed_usafacts_downloaded_20200903.csv",
deathFile="./RInputFiles/Coronavirus/covid_deaths_usafacts_downloaded_20200903.csv",
stateClusters=oldcvec
)
# Comparison of components
# Counties present in the new pull but absent from the previous burdenUS file
# (anti_join keeps rows of burden_20200903 with no countyFIPS match)
burden_20200903 %>%
anti_join(select(burdenUS, countyFIPS)) %>%
count(state, county)
# For counties in both files: element-wise inequality of every cell; rowSums
# counts mismatched columns per row and table() shows the distribution.
# NOTE(review): this assumes both data frames have identical row order and
# column layout -- confirm before trusting the result.
table(rowSums((burden_20200903 %>% semi_join(select(burdenUS, countyFIPS)) != burdenUS)))
# Visual check of the refreshed data through 01-SEP-2020 (10k+ population)
plotBurdenData(burden_20200903, maxDate="2020-09-01", minPop=10000)
# Filtered county data for 25k+ population using data through 31-AUG-2020
countyFiltered_20200903 <- createCountyFiltered(burden_20200903, maxDate="2020-08-31", minPop=25000)
# File read in from previous data (downloaded 2020-09-17), reusing the same
# prior state-cluster assignments as the 2020-09-03 pull
burden_20200917 <- readUSAFacts(
caseFile="./RInputFiles/Coronavirus/covid_confirmed_usafacts_downloaded_20200917.csv",
deathFile="./RInputFiles/Coronavirus/covid_deaths_usafacts_downloaded_20200917.csv",
stateClusters=oldcvec
)
# Comparison of total burden by date vs. previous file
# bind_rows(.id=) tags each input with a *character* id ("1", "2"); the
# original compared source==1 (numeric), which only worked via silent
# coercion -- compare against strings explicitly instead.
bind_rows(burden_20200903, burden_20200917, .id="source") %>%
  mutate(source=factor(case_when(source=="1" ~ "2020-09-03",
                                 source=="2" ~ "2020-09-17",
                                 TRUE ~ "Unknown"),
                       # Newest source first so it draws on top / leads the legend
                       levels=c("2020-09-17", "2020-09-03", "Unknown")
                       )
         ) %>%
  # National totals per source and date
  group_by(source, date) %>%
  summarize(cumDeaths=sum(cumDeaths), cumCases=sum(cumCases)) %>%
  pivot_longer(-c(source, date)) %>%
  ggplot(aes(x=date, y=value/1000, group=source, color=source)) +
  geom_line() +
  # Named-vector lookup maps the metric column name to a facet label
  facet_wrap(~c("cumCases"="Cases", "cumDeaths"="Deaths")[name], scales="free_y") +
  scale_x_date(date_breaks="1 months", date_labels="%m") +
  labs(y="Burden (000s)", title="US National Coronavirus Burden by Source")
# Plot burden data through 15-SEP-2020 (counties with 10k+ population)
plotBurdenData(burden_20200917, maxDate="2020-09-15", minPop=10000)
# Create filtered county data for 25k+ population using data through 15-SEP-2020
countyFiltered_20200917 <- createCountyFiltered(burden_20200917, maxDate="2020-09-15", minPop=25000)
# Assess the previously-fit clusters (countyCluster_km_test, created elsewhere
# in the full file) against the refreshed 2020-09-17 county data.
# NOTE(review): dfPop and dfBurden are the same object here; presumably the
# helper pulls different columns from each -- confirm.
helper_test_20200917 <- helperAssessCountyClusters(countyCluster_km_test$objCluster$cluster,
dfPop=countyFiltered_20200917,
dfBurden=countyFiltered_20200917,
thruLabel="Sep 15, 2020",
plotsTogether=TRUE,
showMap=TRUE,
clusterPlotsTogether=TRUE,
orderCluster=TRUE
)
# Cluster-level daily series: population-weighted mean 7-day cases/deaths per
# million, plus each series normalized to 100 at its within-cluster peak
clusterNorm <- helper_test_20200917 %>%
group_by(cluster, date) %>%
# NOTE: summarize() evaluates its expressions left to right, so cpm7/dpm7 use
# the original county-level pop as weights; pop=sum(pop) must stay LAST or the
# weighted means would use the already-collapsed total instead
summarize(cpm7=sum(pop*cpm7)/sum(pop), dpm7=sum(pop*dpm7)/sum(pop), pop=sum(pop)) %>%
# Regroup by cluster only, so the max() below is the per-cluster peak
group_by(cluster) %>%
mutate(caseNorm=100*cpm7/max(cpm7, na.rm=TRUE), deathNorm=100*dpm7/max(dpm7, na.rm=TRUE)) %>%
ungroup()
# Normalized case/death curves over the full period, one panel per cluster
clusterNorm %>%
select(cluster, date, caseNorm, deathNorm) %>%
pivot_longer(-c(cluster, date)) %>%
filter(!is.na(value)) %>%
# Named-vector lookup maps the metric column name (caseNorm/deathNorm) to a
# display label for the color legend
ggplot(aes(x=date,
y=value,
color=c("deathNorm"="Normalized Death", "caseNorm"="Normalized Cases")[name],
group=name
)
) +
geom_line() +
facet_wrap(~cluster) +
scale_color_discrete("Metric") +
scale_x_date(date_breaks="1 months", date_labels="%m") +
labs(x="2020",
y="Normalized Burden",
title="Burden by Segment",
subtitle="Normalized (100 is segment maximum for metric)"
)
# Same normalized curves, restricted to the early pandemic (Mar 1 - May 31);
# comparing a Date column against character literals coerces the strings to
# dates, so the window filter works as written
clusterNorm %>%
select(cluster, date, caseNorm, deathNorm) %>%
pivot_longer(-c(cluster, date)) %>%
filter(!is.na(value), date>="2020-03-01", date<="2020-05-31") %>%
# Named-vector lookup maps the metric column name to a legend label
ggplot(aes(x=date,
y=value,
color=c("deathNorm"="Normalized Death", "caseNorm"="Normalized Cases")[name],
group=name
)
) +
geom_line() +
facet_wrap(~cluster) +
scale_color_discrete("Metric") +
scale_x_date(date_breaks="1 months", date_labels="%m") +
labs(x="2020",
y="Normalized Burden",
title="Burden by Segment (March 1 - May 31)",
subtitle="Normalized (100 is segment maximum for metric)"
)
# Early-pandemic (Mar 1 - May 31) 7-day per-million rates by cluster, keeping
# only dates where both the case and death rates are present
clusterMarchMay <- clusterNorm %>%
  filter(date >= "2020-03-01",
         date <= "2020-05-31",
         !is.na(cpm7),
         !is.na(dpm7)) %>%
  select(cluster, date, cpm7, dpm7)
# Case/death correlation at lags of 0 to 30 days; helperCorrel is a project
# helper (presumably reads clusterMarchMay from the calling environment --
# confirm), and map_dfr row-binds its per-lag results into one data frame
corrLagEarly <- map_dfr(0:30, helperCorrel)
# Correlation vs. lag, one panel (and one line) per cluster
ggplot(corrLagEarly, aes(x = lag, y = corr, color = cluster, group = cluster)) +
  geom_line() +
  facet_wrap(~cluster)
# Lag with the highest case/death correlation for each cluster;
# filter(corr == max(corr)) keeps ties, so a cluster may yield several rows
bestLag <- corrLagEarly %>%
group_by(cluster) %>%
filter(corr==max(corr)) %>%
ungroup()
# Auto-print the best-lag table in the rendered document
bestLag
# Deaths-to-lagged-cases ratio over the early pandemic, using each cluster's
# best-correlation lag found above
clusterMarchMay %>%
inner_join(bestLag, by="cluster") %>%
group_by(cluster) %>%
# Rows must be date-ordered before the lag()/cumsum() calls below
arrange(date) %>%
# NOTE: `lag(cpm7, ...)` calls dplyr::lag (the function), while `min(lag)`
# reads the joined `lag` COLUMN; min() collapses the (per-cluster constant,
# possibly tied) column to the single integer dplyr::lag() requires
mutate(cpmlag=lag(cpm7, min(lag))) %>%
filter(!is.na(cpmlag)) %>%
# ratCur: same-day ratio of deaths to lagged cases; ratCum: cumulative ratio
mutate(ratCur=dpm7/cpmlag, ratCum=cumsum(dpm7)/cumsum(cpmlag)) %>%
ggplot(aes(x=date, group=cluster, color=cluster)) +
geom_line(aes(y=ratCum)) +
geom_line(aes(y=ratCur), lty=2) +
labs(x="",
y="Cumulative Death vs. Case Ratio",
title="Deaths vs. Cases with Lag (Early Pandemic)",
subtitle="Solid line is cumulative, dashed line is current time period"
) +
# Label the final cumulative ratio as a percentage, nudged above the line
geom_text(data=~filter(., date==max(date)),
aes(label=paste0(round(100*ratCum, 1), "%"), y=ratCum+0.02)
) +
facet_wrap(~cluster)
# Build normalized per-cluster data via the project helper (presumably mirrors
# the manual clusterNorm computation above -- confirm)
cNorm_002 <- helperMakeNormData(helper_test_20200917)
# Create for early pandemic
lagData_early_002 <- helperTestLags(cNorm_002, minDate="2020-03-01", maxDate="2020-05-31")
# Create for late pandemic
lagData_late_002 <- helperTestLags(cNorm_002, minDate="2020-06-01", maxDate="2020-08-31")
# Extract a list of the heaviest hit counties with population 100k+
# (the filter below is pop >= 100000, i.e. 100k)
# NOTE(review): despite its name, `state` is joined against
# usmap::countypop$fips below, so it appears to hold 5-digit county FIPS codes
# -- hence the zero-padding to width 5. Confirm upstream.
keyCounties <- helper_test_20200917 %>%
mutate(state=str_pad(state, width=5, side="left", pad="0")) %>%
filter(pop >= 100000) %>%
group_by(state, cluster) %>%
summarize(dpm=sum(dpm), pop=mean(pop)) %>%
group_by(cluster) %>%
# Top 3 per cluster by total deaths per million (top_n keeps ties)
top_n(n=3, wt=dpm) %>%
ungroup() %>%
arrange(cluster, -dpm) %>%
inner_join(select(usmap::countypop, -pop_2015), by=c("state"="fips")) %>%
# Build display labels like "2 - Autauga (AL)": the "County"/"Parish" suffix
# is replaced by the opening parenthesis, then the state abbr and ")" appended
mutate(countyName=paste0(cluster, " - ",
stringr::str_replace(county, "County|Parish", "("),
abbr,
")"
)
) %>%
select(-abbr, -county)
# Keep only the key counties identified above, then normalize at county grain
cNorm_keyCounties <- helper_test_20200917 %>%
mutate(state=str_pad(state, width=5, side="left", pad="0")) %>%
# Join on the padded FIPS code to restrict to keyCounties and pick up labels
inner_join(select(keyCounties, state, countyName), by=c("state"="state")) %>%
helperMakeNormData(aggBy=c("countyName", "state", "cluster"))
# Create lags for early pandemic at county grain; maxRatio presumably caps
# which candidate lags are considered valid -- confirm against helperTestLags
lagData_early_keycounties <- helperTestLags(cNorm_keyCounties,
minDate="2020-03-01",
maxDate="2020-05-31",
aggBy=c("countyName", "state", "cluster"),
maxRatio=0.25
)
# Create lags for late pandemic (same settings, summer window)
lagData_late_keycounties <- helperTestLags(cNorm_keyCounties,
minDate="2020-06-01",
maxDate="2020-08-31",
aggBy=c("countyName", "state", "cluster"),
maxRatio=0.25
)
# Run for top-12 counties (ranked by pop) in the early pandemic window
exploreTopCounties(helper_test_20200917, minDate="2020-03-01", maxDate="2020-05-31", nVar="pop", nKey=12)
# Run for top-12 counties (ranked by pop) in the late window; note this window
# starts 2020-06-15, unlike the 2020-06-01 start used for lagData_late above
exploreTopCounties(helper_test_20200917, minDate="2020-06-15", maxDate="2020-09-15", nVar="pop", nKey=12)